From 563ae6ef14635c8f4777d7f7089f545520b86265 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Wed, 5 Jun 2024 22:35:13 +0000 Subject: [PATCH 01/27] initial layout --- src/simulators/matrix_product_state/svd.cpp | 277 +++++++++++++++++++- 1 file changed, 275 insertions(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 150d97af4f..7d886cae2e 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -28,6 +28,37 @@ #include namespace AER { + + +#ifdef AER_THRUST_CUDA + +#include +#include +#include +#include +#include + + +#define HANDLE_ERROR(x) \ +{ const auto err = x; \ +if( err != CUTENSORNET_STATUS_SUCCESS ) \ +{ printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); return err; } \ +}; + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \ +}; + + +#endif // AER_THRUST_CUDA + + + + + + // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -552,9 +583,23 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, bool lapack) { if (lapack) { - lapack_csvd_wrapper(A, U, S, V); + + #ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA + + #ifndef AER_THRUST_CUDA + lapack_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA } else { - qiskit_csvd_wrapper(A, U, S, V); + + #ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA + + #ifndef AER_THRUST_CUDA + qiskit_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA } } @@ -667,4 +712,232 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } +#ifdef AER_THRUST_CUDA +void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, 
cmatrix_t &V) +{ + + const size_t cuTensornetVersion = cutensornetGetVersion(); + + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) ); + HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); + + typedef float floatType; + cudaDataType_t typeData = CUDA_R_32F; + + std::vector modesT{'i', 'j'}; // input + std::vector modesU{'i', 'm'}; + std::vector modesV{'n', 'j'}; // SVD output + + + size_t elementsT = 160000; + size_t elementsU = 160000; + size_t elementsS = 400; + size_t elementsV = 160000; + + size_t sizeT = sizeof(floatType) * elementsT; + size_t sizeU = sizeof(floatType) * elementsU; + size_t sizeS = sizeof(floatType) * elementsS; + size_t sizeV = sizeof(floatType) * elementsS; + + floatType *T = (floatType*) malloc(sizeT); + floatType *U = (floatType*) malloc(sizeU); + floatType *S = (floatType*) malloc(sizeS); + floatType *V = (floatType*) malloc(sizeV); + + + void* D_T; + void* D_U; + void* D_S; + void* D_V; + + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_U, sizeU) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) ); + + cudaStream_t stream; + HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); + + cutensornetHandle_t handle; + HANDLE_ERROR( cutensornetCreate(&handle) ); + + cutensornetTensorDescriptor_t descTensorIn; + cutensornetTensorDescriptor_t descTensorU; + cutensornetTensorDescriptor_t descTensorV; + + const int32_t numModesIn = modesT.size(); + const int32_t numModesU = modesU.size(); + const int32_t numModesV = modesV.size(); + + std::vector extentT{400, 400}; // shape of T + std::vector extentU{400, 400}; // shape of U + std::vector extentS{400}; // shape of S + std::vector extentV{400, 400}; // shape of V + + const int64_t* strides = NULL; // assuming fortran layout for all tensors + + HANDLE_ERROR( 
cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + + cutensornetTensorSVDConfig_t svdConfig; + HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) ); + + // set up truncation parameters + double absCutoff = 1e-2; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, + &absCutoff, + sizeof(absCutoff)) ); + double relCutoff = 4e-2; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, + &relCutoff, + sizeof(relCutoff)) ); + + // optional: choose gesvdj algorithm with customized parameters. Default is gesvd. 
+ cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, + &svdAlgo, + sizeof(svdAlgo)) ); + cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, + &gesvdjParams, + sizeof(gesvdjParams)) ); + + /******************************************************** + * Create SVDInfo to record runtime SVD truncation details + *********************************************************/ + + cutensornetTensorSVDInfo_t svdInfo; + HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ; + + /************************************************************** + * Query the required workspace sizes and allocate memory + **************************************************************/ + + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) ); + HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) ); + int64_t hostWorkspaceSize, deviceWorkspaceSize; + // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick + HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &deviceWorkspaceSize) ); + HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, + &hostWorkspaceSize) ); + + void *devWork = nullptr, *hostWork = nullptr; + if (deviceWorkspaceSize > 0) { + HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) ); + } + if (hostWorkspaceSize > 0) { + hostWork = malloc(hostWorkspaceSize); + } + HANDLE_ERROR( 
cutensornetWorkspaceSetMemory(handle, + workDesc, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + devWork, + deviceWorkspaceSize) ); + HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, + workDesc, + CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, + hostWork, + hostWorkspaceSize) ); + + /********** + * Execution + ***********/ + + const int numRuns = 3; // to get stable perf results + for (int i=0; i < numRuns; ++i) + { + // restore output + cudaMemsetAsync(D_U, 0, sizeU, stream); + cudaMemsetAsync(D_S, 0, sizeS, stream); + cudaMemsetAsync(D_V, 0, sizeV, stream); + cudaDeviceSynchronize(); + + // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V. + // We here restore descTensorU/V to the original problem. + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + + HANDLE_ERROR( cutensornetTensorSVD(handle, + descTensorIn, D_T, + descTensorU, D_U, + D_S, + descTensorV, D_V, + svdConfig, + svdInfo, + workDesc, + stream) ); + // Synchronize and measure timing + } + + + HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) ); + + /************************************* + * Query runtime truncation information + **************************************/ + + double discardedWeight{0}; + int64_t reducedExtent{0}; + cutensornetGesvdjStatus_t gesvdjStatus; + cudaDeviceSynchronize(); // device synchronization. 
+ HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) ); + HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) ); + HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, sizeof(gesvdjStatus)) ); + + /*************** + * Free resources + ****************/ + + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) ); + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); + HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) ); + HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) ); + HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) ); + HANDLE_ERROR( cutensornetDestroy(handle) ); + + if (T) free(T); + if (U) free(U); + if (S) free(S); + if (V) free(V); + if (D_T) cudaFree(D_T); + if (D_U) cudaFree(D_U); + if (D_S) cudaFree(D_S); + if (D_V) cudaFree(D_V); + if (devWork) cudaFree(devWork); + if (hostWork) free(hostWork); + +} +#endif // AER_THRUST_CUDA + + } // namespace AER From 280f868a201cf9fa0c32dd978a6e630cb8888794 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Wed, 5 Jun 2024 23:05:05 +0000 Subject: [PATCH 02/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 57 ++++++++++----------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 7d886cae2e..ff16793cd6 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -39,17 +39,26 @@ namespace AER { #include -#define HANDLE_ERROR(x) \ -{ const auto err = x; \ -if( err != CUTENSORNET_STATUS_SUCCESS ) \ -{ printf("Error: %s in line %d\n", 
cutensornetGetErrorString(err), __LINE__); return err; } \ -}; - -#define HANDLE_CUDA_ERROR(x) \ -{ const auto err = x; \ - if( err != cudaSuccess ) \ - { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \ -}; +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; #endif // AER_THRUST_CUDA @@ -712,7 +721,7 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } -#ifdef AER_THRUST_CUDA +//#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { @@ -736,28 +745,22 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & size_t elementsS = 400; size_t elementsV = 160000; - size_t sizeT = sizeof(floatType) * elementsT; - size_t sizeU = sizeof(floatType) * elementsU; - size_t sizeS = sizeof(floatType) * elementsS; - size_t sizeV = sizeof(floatType) * elementsS; - - floatType *T = (floatType*) malloc(sizeT); - floatType *U = (floatType*) malloc(sizeU); - floatType *S = (floatType*) malloc(sizeS); - floatType *V = (floatType*) malloc(sizeV); - + size_t sizeA = sizeof(A); + size_t sizeU = sizeof(U); + size_t sizeS = sizeof(S); + size_t sizeV = sizeof(V); void* D_T; void* D_U; void* D_S; void* D_V; - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeA) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_U, sizeU) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); - HANDLE_CUDA_ERROR( 
cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) ); + HANDLE_CUDA_ERROR( cudaMemcpy(D_T, A, sizeA, cudaMemcpyHostToDevice) ); cudaStream_t stream; HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); @@ -925,10 +928,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) ); HANDLE_ERROR( cutensornetDestroy(handle) ); - if (T) free(T); - if (U) free(U); - if (S) free(S); - if (V) free(V); if (D_T) cudaFree(D_T); if (D_U) cudaFree(D_U); if (D_S) cudaFree(D_S); @@ -937,7 +936,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & if (hostWork) free(hostWork); } -#endif // AER_THRUST_CUDA +//#endif // AER_THRUST_CUDA } // namespace AER From ae44c69f2623cc0ade2563e6987ed652b23f9db9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 05:06:06 +0000 Subject: [PATCH 03/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 31 +++++++++++++++++---- src/simulators/matrix_product_state/svd.hpp | 3 ++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index ff16793cd6..7db0ef108d 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -724,8 +724,17 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, //#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { - - const size_t cuTensornetVersion = cutensornetGetVersion(); + const size_t m = A.GetRows(), n = A.GetColumns(); + const size_t min_dim = std::min(m, n); + const size_t lda = std::max(m, n); + size_t lwork = 2 * min_dim + lda; + + U.resize(m, m); + V.resize(n, n); + + complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); + + const size_t cuTensornetVersion = cutensornetGetVersion(); cudaDeviceProp prop; 
int deviceId{-1}; @@ -750,6 +759,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); + double *cutensor_S = (double*)malloc(sizeof(S)); + void* D_T; void* D_U; void* D_S; @@ -760,7 +771,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); - HANDLE_CUDA_ERROR( cudaMemcpy(D_T, A, sizeA, cudaMemcpyHostToDevice) ); + HANDLE_CUDA_ERROR( cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice) ); cudaStream_t stream; HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); @@ -900,9 +911,17 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & } - HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost) ); + + S.clear(); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); + + A = cmatrix_t::move_from_buffer(m, n, cutensor_A); + U = cmatrix_t::move_from_buffer(m, m, cutensor_U); + V = cmatrix_t::move_from_buffer(n, n, cutensor_V); /************************************* * Query runtime truncation information diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index fac77797fb..c35a9c4406 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -49,6 +49,9 @@ void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, // Lapack call void lapack_csvd_wrapper(cmatrix_t &C, 
cmatrix_t &U, rvector_t &S, cmatrix_t &V); +// cutensor call +void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, + cmatrix_t &V); void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); From 5b4826551a5623ebc8dae11917b25e7806c50ab9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 05:43:10 +0000 Subject: [PATCH 04/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 28 ++++++++------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 7db0ef108d..2b73212f1f 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -724,10 +724,10 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, //#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { - const size_t m = A.GetRows(), n = A.GetColumns(); - const size_t min_dim = std::min(m, n); - const size_t lda = std::max(m, n); - size_t lwork = 2 * min_dim + lda; + const int64_t m = A.GetRows(), n = A.GetColumns(); + const int64_t min_dim = std::min(m, n); + const int64_t lda = std::max(m, n); + int64_t lwork = 2 * min_dim + lda; U.resize(m, m); V.resize(n, n); @@ -744,15 +744,9 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & typedef float floatType; cudaDataType_t typeData = CUDA_R_32F; - std::vector modesT{'i', 'j'}; // input - std::vector modesU{'i', 'm'}; - std::vector modesV{'n', 'j'}; // SVD output - - - size_t elementsT = 160000; - size_t elementsU = 160000; - size_t elementsS = 400; - size_t elementsV = 160000; + std::vector modesT{'m', 'n'}; // input + std::vector modesU{'m', 'm'}; + std::vector modesV{'n', 'n'}; // SVD output size_t sizeA = sizeof(A); size_t sizeU = sizeof(U); @@ -787,10 +781,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, 
rvector_t &S, cmatrix_t & const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentT{400, 400}; // shape of T - std::vector extentU{400, 400}; // shape of U - std::vector extentS{400}; // shape of S - std::vector extentV{400, 400}; // shape of V + std::vector extentT{m, n}; // shape of T + std::vector extentU{m, m}; // shape of U + std::vector extentS{n}; // shape of S + std::vector extentV{n, n}; // shape of V const int64_t* strides = NULL; // assuming fortran layout for all tensors From 517a5548541468e5a81889f1d8392097b9119a1e Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 06:21:25 +0000 Subject: [PATCH 05/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 2b73212f1f..1c60d708b7 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -742,11 +742,11 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); typedef float floatType; - cudaDataType_t typeData = CUDA_R_32F; + cudaDataType_t typeData = CUDA_C_64F; - std::vector modesT{'m', 'n'}; // input - std::vector modesU{'m', 'm'}; - std::vector modesV{'n', 'n'}; // SVD output + std::vector modesT{'a', 'b'}; // input + std::vector modesU{'c', 'd'}; + std::vector modesV{'e', 'c'}; // SVD output size_t sizeA = sizeof(A); size_t sizeU = sizeof(U); From 7e40588b4b8f21dbff6876702a6d7721609118aa Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 08:23:35 +0000 Subject: [PATCH 06/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 
1c60d708b7..4be447ea37 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -782,15 +782,18 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & const int32_t numModesV = modesV.size(); std::vector extentT{m, n}; // shape of T - std::vector extentU{m, m}; // shape of U - std::vector extentS{n}; // shape of S - std::vector extentV{n, n}; // shape of V + std::vector extentU{lda, lda}; // shape of U + std::vector extentS{lda}; // shape of S + std::vector extentV{min_dim, min_dim}; // shape of V - const int64_t* strides = NULL; // assuming fortran layout for all tensors + std::vector stridesT{n, 1}; + std::vector stridesU{lda, 1}; + std::vector stridesV{min_dim, 1}; - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), stridesT.data(), modesT.data(), typeData, &descTensorIn) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); cutensornetTensorSVDConfig_t svdConfig; HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) ); @@ -889,8 +892,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & // We here restore descTensorU/V to the original problem. 
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); HANDLE_ERROR( cutensornetTensorSVD(handle, descTensorIn, D_T, From ebf9ca09e1ae4a9b3e704199705609e1cc17e7f9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 09:32:44 +0000 Subject: [PATCH 07/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 478 ++++++++++---------- 1 file changed, 237 insertions(+), 241 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 4be447ea37..6c7d432df5 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -29,15 +29,13 @@ namespace AER { - #ifdef AER_THRUST_CUDA -#include -#include +#include +#include +#include #include #include -#include - #define HANDLE_ERROR(x) \ { \ @@ -60,14 +58,8 @@ namespace AER { } \ }; - #endif // AER_THRUST_CUDA - - - - - // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -593,22 +585,22 @@ void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, bool lapack) { if (lapack) { - #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA - - #ifndef AER_THRUST_CUDA - lapack_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA +#ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); +#endif 
// AER_THRUST_CUDA + +#ifndef AER_THRUST_CUDA + lapack_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA } else { - #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA - - #ifndef AER_THRUST_CUDA - qiskit_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA +#ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA + +#ifndef AER_THRUST_CUDA + qiskit_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA } } @@ -721,238 +713,242 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } -//#ifdef AER_THRUST_CUDA -void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) -{ - const int64_t m = A.GetRows(), n = A.GetColumns(); - const int64_t min_dim = std::min(m, n); - const int64_t lda = std::max(m, n); - int64_t lwork = 2 * min_dim + lda; - - U.resize(m, m); - V.resize(n, n); - - complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); - - const size_t cuTensornetVersion = cutensornetGetVersion(); - - cudaDeviceProp prop; - int deviceId{-1}; - HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) ); - HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); - - typedef float floatType; - cudaDataType_t typeData = CUDA_C_64F; - - std::vector modesT{'a', 'b'}; // input - std::vector modesU{'c', 'd'}; - std::vector modesV{'e', 'c'}; // SVD output - - size_t sizeA = sizeof(A); - size_t sizeU = sizeof(U); - size_t sizeS = sizeof(S); - size_t sizeV = sizeof(V); - - double *cutensor_S = (double*)malloc(sizeof(S)); - - void* D_T; - void* D_U; - void* D_S; - void* D_V; - - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeA) ); - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_U, sizeU) ); - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); - - HANDLE_CUDA_ERROR( cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice) ); - - cudaStream_t stream; - 
HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); - - cutensornetHandle_t handle; - HANDLE_ERROR( cutensornetCreate(&handle) ); - - cutensornetTensorDescriptor_t descTensorIn; - cutensornetTensorDescriptor_t descTensorU; - cutensornetTensorDescriptor_t descTensorV; - - const int32_t numModesIn = modesT.size(); - const int32_t numModesU = modesU.size(); - const int32_t numModesV = modesV.size(); - - std::vector extentT{m, n}; // shape of T - std::vector extentU{lda, lda}; // shape of U - std::vector extentS{lda}; // shape of S - std::vector extentV{min_dim, min_dim}; // shape of V - - std::vector stridesT{n, 1}; - std::vector stridesU{lda, 1}; - std::vector stridesV{min_dim, 1}; - - - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), stridesT.data(), modesT.data(), typeData, &descTensorIn) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); - - cutensornetTensorSVDConfig_t svdConfig; - HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) ); - - // set up truncation parameters - double absCutoff = 1e-2; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, - &absCutoff, - sizeof(absCutoff)) ); - double relCutoff = 4e-2; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, - &relCutoff, - sizeof(relCutoff)) ); - - // optional: choose gesvdj algorithm with customized parameters. Default is gesvd. 
- cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, - &svdAlgo, - sizeof(svdAlgo)) ); - cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, - &gesvdjParams, - sizeof(gesvdjParams)) ); - - /******************************************************** +// #ifdef AER_THRUST_CUDA +void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, + cmatrix_t &V) { + const int64_t m = A.GetRows(), n = A.GetColumns(); + const int64_t min_dim = std::min(m, n); + const int64_t lda = std::max(m, n); + int64_t lwork = 2 * min_dim + lda; + + // Transpose when m < n + bool transposed = false; + if (m < n) { + transposed = true; + A = AER::Utils::dagger(A); + } + + complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), + *cutensor_V = V.move_to_buffer(); + + const size_t cuTensornetVersion = cutensornetGetVersion(); + + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); + + typedef float floatType; + cudaDataType_t typeData = CUDA_C_64F; + + std::vector modesA{'a', 'b'}; // input + std::vector modesU{'c', 'd'}; + std::vector modesV{'e', 'c'}; // SVD output + + size_t sizeA = sizeof(A); + size_t sizeU = sizeof(U); + size_t sizeS = sizeof(S); + size_t sizeV = sizeof(V); + + double *cutensor_S = (double *)malloc(sizeof(S)); + + void *D_T; + void *D_U; + void *D_S; + void *D_V; + + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_T, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV)); + + HANDLE_CUDA_ERROR(cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice)); + + 
cudaStream_t stream; + HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); + + cutensornetHandle_t handle; + HANDLE_ERROR(cutensornetCreate(&handle)); + + cutensornetTensorDescriptor_t descTensorIn; + cutensornetTensorDescriptor_t descTensorU; + cutensornetTensorDescriptor_t descTensorV; + + const int32_t numModesA = modesA.size(); + const int32_t numModesU = modesU.size(); + const int32_t numModesV = modesV.size(); + + std::vector extentA{lda, min_dim}; // shape of A + std::vector extentU{lda, lda}; // shape of U + std::vector extentS{min_dim}; // shape of S + std::vector extentV{min_dim, min_dim}; // shape of V + + std::vector stridesA{n, 1}; + std::vector stridesU{lda, 1}; + std::vector stridesV{min_dim, 1}; + + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesA, extentA.data(), stridesA.data(), modesA.data(), + typeData, &descTensorIn)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), + typeData, &descTensorU)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), + typeData, &descTensorV)); + + cutensornetTensorSVDConfig_t svdConfig; + HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig)); + + // set up truncation parameters + double absCutoff = 1e-2; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, &absCutoff, + sizeof(absCutoff))); + double relCutoff = 4e-2; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, &relCutoff, + sizeof(relCutoff))); + + // optional: choose gesvdj algorithm with customized parameters. Default is + // gesvd. 
+ cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, &svdAlgo, + sizeof(svdAlgo))); + cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, + &gesvdjParams, sizeof(gesvdjParams))); + + /******************************************************** * Create SVDInfo to record runtime SVD truncation details *********************************************************/ - cutensornetTensorSVDInfo_t svdInfo; - HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ; + cutensornetTensorSVDInfo_t svdInfo; + HANDLE_ERROR(cutensornetCreateTensorSVDInfo(handle, &svdInfo)); - /************************************************************** + /************************************************************** * Query the required workspace sizes and allocate memory **************************************************************/ - cutensornetWorkspaceDescriptor_t workDesc; - HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) ); - HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) ); - int64_t hostWorkspaceSize, deviceWorkspaceSize; - // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick - HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, - workDesc, - CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, - &deviceWorkspaceSize) ); - HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, - workDesc, - CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, - CUTENSORNET_MEMSPACE_HOST, - CUTENSORNET_WORKSPACE_SCRATCH, - &hostWorkspaceSize) ); - - void *devWork = nullptr, *hostWork = nullptr; - if (deviceWorkspaceSize > 0) { - HANDLE_CUDA_ERROR( cudaMalloc(&devWork, 
deviceWorkspaceSize) ); - } - if (hostWorkspaceSize > 0) { - hostWork = malloc(hostWorkspaceSize); - } - HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, - workDesc, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, - devWork, - deviceWorkspaceSize) ); - HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, - workDesc, - CUTENSORNET_MEMSPACE_HOST, - CUTENSORNET_WORKSPACE_SCRATCH, - hostWork, - hostWorkspaceSize) ); - - /********** + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); + HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( + handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc)); + int64_t hostWorkspaceSize, deviceWorkspaceSize; + // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, + &deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH, + &hostWorkspaceSize)); + + void *devWork = nullptr, *hostWork = nullptr; + if (deviceWorkspaceSize > 0) { + HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize)); + } + if (hostWorkspaceSize > 0) { + hostWork = malloc(hostWorkspaceSize); + } + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, devWork, deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize)); + + /********** * Execution ***********/ - const int numRuns = 3; // to get stable perf results - for (int i=0; i < numRuns; ++i) - { - // restore output - cudaMemsetAsync(D_U, 0, sizeU, stream); - cudaMemsetAsync(D_S, 0, sizeS, stream); - 
cudaMemsetAsync(D_V, 0, sizeV, stream); - cudaDeviceSynchronize(); - - // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V. - // We here restore descTensorU/V to the original problem. - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); - - HANDLE_ERROR( cutensornetTensorSVD(handle, - descTensorIn, D_T, - descTensorU, D_U, - D_S, - descTensorV, D_V, - svdConfig, - svdInfo, - workDesc, - stream) ); - // Synchronize and measure timing - } - - - HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost) ); - - S.clear(); - for (int i = 0; i < min_dim; i++) - S.push_back(cutensor_S[i]); - - A = cmatrix_t::move_from_buffer(m, n, cutensor_A); - U = cmatrix_t::move_from_buffer(m, m, cutensor_U); - V = cmatrix_t::move_from_buffer(n, n, cutensor_V); - - /************************************* + const int numRuns = 3; // to get stable perf results + for (int i = 0; i < numRuns; ++i) { + // restore output + cudaMemsetAsync(D_U, 0, sizeU, stream); + cudaMemsetAsync(D_S, 0, sizeS, stream); + cudaMemsetAsync(D_V, 0, sizeV, stream); + cudaDeviceSynchronize(); + + // With value-based truncation, `cutensornetTensorSVD` can potentially + // update the shared extent in descTensorU/V. We here restore descTensorU/V + // to the original problem. 
+ HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), + typeData, &descTensorU)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), + typeData, &descTensorV)); + + HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorIn, D_T, descTensorU, + D_U, D_S, descTensorV, D_V, svdConfig, + svdInfo, workDesc, stream)); + // Synchronize and measure timing + } + + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); + + S.clear(); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); + + A = cmatrix_t::move_from_buffer(m, n, cutensor_A); + U = cmatrix_t::move_from_buffer(m, m, cutensor_U); + V = cmatrix_t::move_from_buffer(n, n, cutensor_V); + + /************************************* * Query runtime truncation information **************************************/ - double discardedWeight{0}; - int64_t reducedExtent{0}; - cutensornetGesvdjStatus_t gesvdjStatus; - cudaDeviceSynchronize(); // device synchronization. 
- HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) ); - HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) ); - HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, sizeof(gesvdjStatus)) ); - - /*************** + double discardedWeight{0}; + int64_t reducedExtent{0}; + cutensornetGesvdjStatus_t gesvdjStatus; + cudaDeviceSynchronize(); // device synchronization. + HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( + handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, + &discardedWeight, sizeof(discardedWeight))); + HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( + handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, + &reducedExtent, sizeof(reducedExtent))); + HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( + handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, + sizeof(gesvdjStatus))); + + /*************** * Free resources ****************/ - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) ); - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); - HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) ); - HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) ); - HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) ); - HANDLE_ERROR( cutensornetDestroy(handle) ); - - if (D_T) cudaFree(D_T); - if (D_U) cudaFree(D_U); - if (D_S) cudaFree(D_S); - if (D_V) cudaFree(D_V); - if (devWork) cudaFree(devWork); - if (hostWork) free(hostWork); - + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorIn)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); + 
HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); + HANDLE_ERROR(cutensornetDestroyTensorSVDInfo(svdInfo)); + HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + HANDLE_ERROR(cutensornetDestroy(handle)); + + if (D_T) + cudaFree(D_T); + if (D_U) + cudaFree(D_U); + if (D_S) + cudaFree(D_S); + if (D_V) + cudaFree(D_V); + if (devWork) + cudaFree(devWork); + if (hostWork) + free(hostWork); } -//#endif // AER_THRUST_CUDA - +// #endif // AER_THRUST_CUDA } // namespace AER From a42269042f199a7e8b1ffa65e3b0ecfb0b57a28f Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 09:40:47 +0000 Subject: [PATCH 08/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 6c7d432df5..dcc515b0c2 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -770,7 +770,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cutensornetHandle_t handle; HANDLE_ERROR(cutensornetCreate(&handle)); - cutensornetTensorDescriptor_t descTensorIn; + cutensornetTensorDescriptor_t descTensorA; cutensornetTensorDescriptor_t descTensorU; cutensornetTensorDescriptor_t descTensorV; @@ -778,7 +778,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentA{lda, min_dim}; // shape of A + std::vector extentA{lda, min_dim}; // shape of A std::vector extentU{lda, lda}; // shape of U std::vector extentS{min_dim}; // shape of S std::vector extentV{min_dim, min_dim}; // shape of V @@ -789,7 +789,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetCreateTensorDescriptor( handle, numModesA, extentA.data(), stridesA.data(), modesA.data(), - typeData, &descTensorIn)); + 
typeData, &descTensorA)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU)); @@ -835,7 +835,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cutensornetWorkspaceDescriptor_t workDesc; HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( - handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc)); + handle, descTensorA, descTensorU, descTensorV, svdConfig, workDesc)); int64_t hostWorkspaceSize, deviceWorkspaceSize; // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( @@ -885,7 +885,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV)); - HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorIn, D_T, descTensorU, + HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_T, descTensorU, D_U, D_S, descTensorV, D_V, svdConfig, svdInfo, workDesc, stream)); // Synchronize and measure timing @@ -902,9 +902,9 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, for (int i = 0; i < min_dim; i++) S.push_back(cutensor_S[i]); - A = cmatrix_t::move_from_buffer(m, n, cutensor_A); - U = cmatrix_t::move_from_buffer(m, m, cutensor_U); - V = cmatrix_t::move_from_buffer(n, n, cutensor_V); + A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); + U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); + V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); /************************************* * Query runtime truncation information @@ -928,7 +928,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, * Free resources ****************/ - HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorIn)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA)); 
HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); From 80b59d508c587c75811856fb9a12f4ba8ad6527d Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 09:59:45 +0000 Subject: [PATCH 09/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index dcc515b0c2..7e4aa586ea 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -713,7 +713,7 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } -// #ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { const int64_t m = A.GetRows(), n = A.GetColumns(); @@ -906,6 +906,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); + validate_SVD_result(A, U, S, V); + /************************************* * Query runtime truncation information **************************************/ @@ -949,6 +951,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, if (hostWork) free(hostWork); } -// #endif // AER_THRUST_CUDA +#endif // AER_THRUST_CUDA } // namespace AER From 52f1ed4f30a5736311b19ca0e18fe2da68662e47 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Fri, 7 Jun 2024 08:47:05 +0000 Subject: [PATCH 10/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 7e4aa586ea..b96f33c2c0 100644 --- a/src/simulators/matrix_product_state/svd.cpp 
+++ b/src/simulators/matrix_product_state/svd.cpp @@ -738,19 +738,19 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - typedef float floatType; + typedef double floatType; cudaDataType_t typeData = CUDA_C_64F; - std::vector modesA{'a', 'b'}; // input - std::vector modesU{'c', 'd'}; - std::vector modesV{'e', 'c'}; // SVD output + std::vector modesA{'m', 'n'}; // input + std::vector modesU{'n', 'x'}; + std::vector modesV{'x', 'n'}; // SVD output size_t sizeA = sizeof(A); size_t sizeU = sizeof(U); size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); - double *cutensor_S = (double *)malloc(sizeof(S)); + floatType *cutensor_S = (floatType *)malloc(sizeof(S)); void *D_T; void *D_U; @@ -779,12 +779,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesV = modesV.size(); std::vector extentA{lda, min_dim}; // shape of A - std::vector extentU{lda, lda}; // shape of U + std::vector extentU{min_dim, min_dim}; // shape of U std::vector extentS{min_dim}; // shape of S std::vector extentV{min_dim, min_dim}; // shape of V std::vector stridesA{n, 1}; - std::vector stridesU{lda, 1}; + std::vector stridesU{min_dim, 1}; std::vector stridesV{min_dim, 1}; HANDLE_ERROR(cutensornetCreateTensorDescriptor( @@ -888,7 +888,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_T, descTensorU, D_U, D_S, descTensorV, D_V, svdConfig, svdInfo, workDesc, stream)); - // Synchronize and measure timing } HANDLE_CUDA_ERROR( From ed43e7155d500906ba1b576dd8151529f00c059f Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sun, 9 Jun 2024 07:06:48 +0530 Subject: [PATCH 11/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 65 ++++++++++----------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git 
a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index b96f33c2c0..94fa5ab349 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -29,37 +29,6 @@ namespace AER { -#ifdef AER_THRUST_CUDA - -#include -#include -#include -#include -#include - -#define HANDLE_ERROR(x) \ - { \ - const auto err = x; \ - if (err != CUTENSORNET_STATUS_SUCCESS) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " \ - << cutensornetGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - -#define HANDLE_CUDA_ERROR(x) \ - { \ - const auto err = x; \ - if (err != cudaSuccess) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - -#endif // AER_THRUST_CUDA - // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -714,6 +683,35 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } #ifdef AER_THRUST_CUDA + +#include +#include +#include +#include +#include + +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +namespace TensorNetwork { void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { const int64_t m = A.GetRows(), n = A.GetColumns(); @@ -750,7 +748,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); - floatType *cutensor_S = (floatType 
*)malloc(sizeof(S)); + std::complex *cutensor_S = (std::complex *)malloc(sizeof(S)); void *D_T; void *D_U; @@ -899,7 +897,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, S.clear(); for (int i = 0; i < min_dim; i++) - S.push_back(cutensor_S[i]); + S.push_back(cutensor_S[i].real()); A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); @@ -950,6 +948,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, if (hostWork) free(hostWork); } +} // namespace TensorNetwork #endif // AER_THRUST_CUDA } // namespace AER From 649a5d73b91618df515e6df0ff96f724072607d3 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sun, 9 Jun 2024 13:32:15 +0530 Subject: [PATCH 12/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 68 ++++++++------------- src/simulators/matrix_product_state/svd.hpp | 5 +- 2 files changed, 30 insertions(+), 43 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 94fa5ab349..2901e75d91 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -684,7 +684,6 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, #ifdef AER_THRUST_CUDA -#include #include #include #include @@ -717,7 +716,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int64_t m = A.GetRows(), n = A.GetColumns(); const int64_t min_dim = std::min(m, n); const int64_t lda = std::max(m, n); - int64_t lwork = 2 * min_dim + lda; // Transpose when m < n bool transposed = false; @@ -748,19 +746,19 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); - std::complex *cutensor_S = (std::complex *)malloc(sizeof(S)); + complex_t *cutensor_S = (complex_t *)malloc(sizeof(S)); - void *D_T; + void *D_A; void *D_U; void *D_S; void *D_V; - 
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_T, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_A, sizeA)); HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU)); HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS)); HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV)); - HANDLE_CUDA_ERROR(cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice)); cudaStream_t stream; HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); @@ -781,29 +779,28 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector extentS{min_dim}; // shape of S std::vector extentV{min_dim, min_dim}; // shape of V - std::vector stridesA{n, 1}; - std::vector stridesU{min_dim, 1}; - std::vector stridesV{min_dim, 1}; + const int64_t *strides = + NULL; // matrices stores the entries in column-major-order. HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesA, extentA.data(), stridesA.data(), modesA.data(), - typeData, &descTensorA)); + handle, numModesA, extentA.data(), strides, modesA.data(), typeData, + &descTensorA)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), - typeData, &descTensorU)); + handle, numModesU, extentU.data(), strides, modesU.data(), typeData, + &descTensorU)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), - typeData, &descTensorV)); + handle, numModesV, extentV.data(), strides, modesV.data(), typeData, + &descTensorV)); cutensornetTensorSVDConfig_t svdConfig; HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig)); // set up truncation parameters - double absCutoff = 1e-2; + double absCutoff = 1e-13; HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, &absCutoff, sizeof(absCutoff))); - double relCutoff = 4e-2; + double relCutoff = 4e-13; 
HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, &relCutoff, sizeof(relCutoff))); @@ -877,13 +874,13 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), - typeData, &descTensorU)); + handle, numModesU, extentU.data(), strides, modesU.data(), typeData, + &descTensorU)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), - typeData, &descTensorV)); + handle, numModesV, extentV.data(), strides, modesV.data(), typeData, + &descTensorV)); - HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_T, descTensorU, + HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U, D_S, descTensorV, D_V, svdConfig, svdInfo, workDesc, stream)); } @@ -903,25 +900,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - validate_SVD_result(A, U, S, V); + if (transposed) { + transposed = false; + A = AER::Utils::dagger(A); + } - /************************************* - * Query runtime truncation information - **************************************/ - - double discardedWeight{0}; - int64_t reducedExtent{0}; - cutensornetGesvdjStatus_t gesvdjStatus; - cudaDeviceSynchronize(); // device synchronization. 
- HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( - handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, - &discardedWeight, sizeof(discardedWeight))); - HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( - handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, - &reducedExtent, sizeof(reducedExtent))); - HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( - handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, - sizeof(gesvdjStatus))); + validate_SVD_result(A, U, S, V); /*************** * Free resources @@ -935,8 +919,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); HANDLE_ERROR(cutensornetDestroy(handle)); - if (D_T) - cudaFree(D_T); + if (D_A) + cudaFree(D_A); if (D_U) cudaFree(D_U); if (D_S) diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index c35a9c4406..0f0715b480 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -49,9 +49,12 @@ void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, // Lapack call void lapack_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); + +#ifdef AER_THRUST_CUDA // cutensor call void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V); + cmatrix_t &V); +#endif // AER_THRUST_CUDA void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); From c33571fd51c7abf07271330d729956cc454bd7dd Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 11 Jun 2024 14:59:43 +0000 Subject: [PATCH 13/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 51 +++++++++------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index b022caa12a..2a5418dd51 100644 --- 
a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -710,19 +710,13 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } \ }; -namespace TensorNetwork { void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { - const int64_t m = A.GetRows(), n = A.GetColumns(); - const int64_t min_dim = std::min(m, n); - const int64_t lda = std::max(m, n); + cmatrix_t A_cpy = A; - // Transpose when m < n - bool transposed = false; - if (m < n) { - transposed = true; - A = AER::Utils::dagger(A); - } + const int64_t m = A.GetRows(), n = A.GetColumns(); + U.resize(n, n); + V.resize(n, n); complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); @@ -741,12 +735,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector modesU{'n', 'x'}; std::vector modesV{'x', 'n'}; // SVD output - size_t sizeA = sizeof(A); - size_t sizeU = sizeof(U); - size_t sizeS = sizeof(S); - size_t sizeV = sizeof(V); + size_t sizeA = A.size() * sizeof(cmatrix_t); + size_t sizeU = U.size() * sizeof(cmatrix_t); + size_t sizeS = S.size() * sizeof(complex_t); + size_t sizeV = V.size() * sizeof(cmatrix_t); - complex_t *cutensor_S = (complex_t *)malloc(sizeof(S)); + complex_t *cutensor_S = (complex_t *)malloc(sizeS); void *D_A; void *D_U; @@ -774,10 +768,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentA{lda, min_dim}; // shape of A - std::vector extentU{min_dim, min_dim}; // shape of U - std::vector extentS{min_dim}; // shape of S - std::vector extentV{min_dim, min_dim}; // shape of V + std::vector extentA{m, n}; // shape of A + std::vector extentU{n, n}; // shape of U :) + std::vector extentS{n}; // shape of S + std::vector extentV{n, n}; // shape of V const int64_t *strides = NULL; // matrices stores the entries in 
column-major-order. @@ -893,19 +887,14 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); S.clear(); - for (int i = 0; i < min_dim; i++) + for (int i = 0; i < n; i++) S.push_back(cutensor_S[i].real()); - A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); - U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); - V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - - if (transposed) { - transposed = false; - A = AER::Utils::dagger(A); - } + A = cmatrix_t::move_from_buffer(m, n, cutensor_A); + U = cmatrix_t::move_from_buffer(n, n, cutensor_U); + V = cmatrix_t::move_from_buffer(n, n, cutensor_V); - validate_SVD_result(A, U, S, V); + validate_SVD_result(A_cpy, U, S, V); /*************** * Free resources @@ -917,8 +906,11 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); HANDLE_ERROR(cutensornetDestroyTensorSVDInfo(svdInfo)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); HANDLE_ERROR(cutensornetDestroy(handle)); + if (cutensor_S) + free(cutensor_S); if (D_A) cudaFree(D_A); if (D_U) @@ -932,7 +924,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, if (hostWork) free(hostWork); } -} // namespace TensorNetwork #endif // AER_THRUST_CUDA } // namespace AER From f0205e37f281e0f68da784685f0b5fed865458a5 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Fri, 14 Jun 2024 14:02:08 +0000 Subject: [PATCH 14/27] refactor code --- .../matrix_product_state.hpp | 3 +++ .../matrix_product_state_internal.cpp | 10 +++++--- .../matrix_product_state_internal.hpp | 8 +++++-- .../matrix_product_state_tensor.hpp | 6 ++--- src/simulators/matrix_product_state/svd.cpp | 24 ++++++------------- src/simulators/matrix_product_state/svd.hpp | 2 +- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git 
a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 1cfeb9b43d..676f3b9425 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -373,6 +373,9 @@ void State::set_config(const Config &config) { // Set LAPACK SVD MPS::set_mps_lapack_svd(config.mps_lapack); + + // Set device for SVD + MPS::set_mps_svd_device(config.device); } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index e6cacd0239..e02b3fec88 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -19,6 +19,7 @@ #include "stdlib.h" #include "string.h" #include +#include #include #include "framework/linalg/almost_equal.hpp" @@ -45,6 +46,7 @@ double MPS::json_chop_threshold_ = 1E-8; std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; +std::string MPS::mps_svd_device_; //------------------------------------------------------------------------ // local function declarations @@ -663,8 +665,9 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, - right_gamma, MPS::mps_lapack_); + double discarded_value = + MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, + MPS::mps_lapack_, MPS::mps_svd_device_); if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1803,7 +1806,8 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // step 2 - SVD S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); - 
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_, + MPS::mps_svd_device_); reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 1180e6cddf..04eb08e7fc 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -15,12 +15,12 @@ #ifndef _aer_matrix_product_state_hpp_ #define _aer_matrix_product_state_hpp_ -#include - #include "framework/json.hpp" #include "framework/operations.hpp" #include "framework/utils.hpp" #include "matrix_product_state_tensor.hpp" +#include +#include namespace AER { namespace MatrixProductState { @@ -321,6 +321,9 @@ class MPS { } static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } + static void set_mps_svd_device(std::string mps_svd_device) { + mps_svd_device_ = mps_svd_device; + } static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } @@ -570,6 +573,7 @@ class MPS { static bool mps_log_data_; static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; + static std::string mps_svd_device_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index b769e8a59f..ed9d0bf1be 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -159,7 +159,7 @@ class MPS_Tensor { const MPS_Tensor &right_gamma, bool mul_by_lambda); static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, 
rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack); + bool mps_lapack, std::string mps_svd_device); static void reshape_for_3_qubits_before_SVD(const std::vector data, MPS_Tensor &reshaped_tensor); static void contract_2_dimensions(const MPS_Tensor &left_gamma, @@ -592,13 +592,13 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, //--------------------------------------------------------------- double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack) { + bool mps_lapack, std::string mps_svd_device) { cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); - csvd_wrapper(C, U, S, V, mps_lapack); + csvd_wrapper(C, U, S, V, mps_lapack, mps_svd_device); double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 2a5418dd51..30d102459f 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -551,25 +551,15 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { } void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack) { - if (lapack) { - -#ifdef AER_THRUST_CUDA + bool lapack, std::string mps_svd_device) { + if (mps_svd_device.compare("GPU") == 0) { cutensor_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA - -#ifndef AER_THRUST_CUDA - lapack_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA } else { - -#ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA - -#ifndef AER_THRUST_CUDA - qiskit_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA + if (lapack) { + lapack_csvd_wrapper(A, U, S, V); + } else { + qiskit_csvd_wrapper(A, U, S, V); + } } } diff --git 
a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 0f0715b480..9c44cfb83b 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -42,7 +42,7 @@ double reduce_zeros(cmatrix_t &U, rvector_t &S, cmatrix_t &V, status csvd(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); // Entry point for the SVD calculation void csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack); + bool lapack, std::string mps_svd_device); // Original qiskit call void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); From 629f65f5c0262c7571dab724076a45525f212cc7 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sat, 15 Jun 2024 12:25:34 +0000 Subject: [PATCH 15/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 133 +++++++------------- 1 file changed, 44 insertions(+), 89 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 30d102459f..0c8d26d7c9 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -553,7 +553,10 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, bool lapack, std::string mps_svd_device) { if (mps_svd_device.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA cutensor_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA + } else { if (lapack) { lapack_csvd_wrapper(A, U, S, V); @@ -702,11 +705,28 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { + + bool transposed = false; + + const int64_t rows = A.GetRows(), cols = A.GetColumns(); + + if (rows < cols) { + transposed = true; + A = AER::Utils::dagger(A); + } cmatrix_t A_cpy = A; - const int64_t m = A.GetRows(), n = A.GetColumns(); - 
U.resize(n, n); - V.resize(n, n); + const int64_t min_dim = std::min(rows, cols); + const int64_t lda = std::max(rows, cols); + + U.resize(lda, min_dim); + V.resize(min_dim, min_dim); + S.resize(min_dim); + + size_t sizeA = A.size() * sizeof(complex_t); + size_t sizeU = U.size() * sizeof(complex_t); + size_t sizeS = S.size() * sizeof(double); + size_t sizeV = V.size() * sizeof(complex_t); complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); @@ -718,19 +738,13 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - typedef double floatType; cudaDataType_t typeData = CUDA_C_64F; - std::vector modesA{'m', 'n'}; // input - std::vector modesU{'n', 'x'}; - std::vector modesV{'x', 'n'}; // SVD output - - size_t sizeA = A.size() * sizeof(cmatrix_t); - size_t sizeU = U.size() * sizeof(cmatrix_t); - size_t sizeS = S.size() * sizeof(complex_t); - size_t sizeV = V.size() * sizeof(cmatrix_t); + std::vector modesA{'m', 'n'}; + std::vector modesU{'m', 'x'}; + std::vector modesV{'x', 'n'}; - complex_t *cutensor_S = (complex_t *)malloc(sizeS); + double *cutensor_S = (double *)malloc(sizeS); void *D_A; void *D_U; @@ -758,10 +772,9 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentA{m, n}; // shape of A - std::vector extentU{n, n}; // shape of U :) - std::vector extentS{n}; // shape of S - std::vector extentV{n, n}; // shape of V + std::vector extentA{lda, min_dim}; // shape of A + std::vector extentU{lda, min_dim}; // shape of U :) + std::vector extentV{min_dim, min_dim}; // shape of V const int64_t *strides = NULL; // matrices stores the entries in column-major-order. 
@@ -776,45 +789,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV)); - cutensornetTensorSVDConfig_t svdConfig; - HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig)); - - // set up truncation parameters - double absCutoff = 1e-13; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, &absCutoff, - sizeof(absCutoff))); - double relCutoff = 4e-13; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, &relCutoff, - sizeof(relCutoff))); - - // optional: choose gesvdj algorithm with customized parameters. Default is - // gesvd. - cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, &svdAlgo, - sizeof(svdAlgo))); - cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, - &gesvdjParams, sizeof(gesvdjParams))); - - /******************************************************** - * Create SVDInfo to record runtime SVD truncation details - *********************************************************/ - - cutensornetTensorSVDInfo_t svdInfo; - HANDLE_ERROR(cutensornetCreateTensorSVDInfo(handle, &svdInfo)); - - /************************************************************** - * Query the required workspace sizes and allocate memory - **************************************************************/ - cutensornetWorkspaceDescriptor_t workDesc; HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( - handle, descTensorA, descTensorU, descTensorV, svdConfig, workDesc)); + handle, descTensorA, descTensorU, descTensorV, 
NULL, workDesc)); int64_t hostWorkspaceSize, deviceWorkspaceSize; // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( @@ -840,34 +818,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, handle, workDesc, CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize)); - /********** - * Execution - ***********/ - - const int numRuns = 3; // to get stable perf results - for (int i = 0; i < numRuns; ++i) { - // restore output - cudaMemsetAsync(D_U, 0, sizeU, stream); - cudaMemsetAsync(D_S, 0, sizeS, stream); - cudaMemsetAsync(D_V, 0, sizeV, stream); - cudaDeviceSynchronize(); - - // With value-based truncation, `cutensornetTensorSVD` can potentially - // update the shared extent in descTensorU/V. We here restore descTensorU/V - // to the original problem. - HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); - HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); - HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesU, extentU.data(), strides, modesU.data(), typeData, - &descTensorU)); - HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesV, extentV.data(), strides, modesV.data(), typeData, - &descTensorV)); - - HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, - D_U, D_S, descTensorV, D_V, svdConfig, - svdInfo, workDesc, stream)); - } + // Requesting for Exact SVD. 
+ HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U, + D_S, descTensorV, D_V, NULL, NULL, workDesc, + stream)); HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost)); @@ -877,14 +831,17 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); S.clear(); - for (int i = 0; i < n; i++) - S.push_back(cutensor_S[i].real()); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); - A = cmatrix_t::move_from_buffer(m, n, cutensor_A); - U = cmatrix_t::move_from_buffer(n, n, cutensor_U); - V = cmatrix_t::move_from_buffer(n, n, cutensor_V); + A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); + U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U); + V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - validate_SVD_result(A_cpy, U, S, V); + validate_SVdD_result(A_cpy, U, S, V); + if (transposed) { + std::swap(U, V); + } /*************** * Free resources @@ -893,8 +850,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); - HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); - HANDLE_ERROR(cutensornetDestroyTensorSVDInfo(svdInfo)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); HANDLE_ERROR(cutensornetDestroy(handle)); From 644a822ac86eb388d9f1b8d34a6424d9338deb4b Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sun, 16 Jun 2024 23:59:24 +0000 Subject: [PATCH 16/27] added release note --- ...s-svd-with-cuquantum-c0392854d1f373e0.yaml | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml diff --git 
a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml new file mode 100644 index 0000000000..ad5d08cf6e --- /dev/null +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -0,0 +1,40 @@ +--- +features: + - | + This PR adds the ability to run Matrix Product State Simulation on Nvidia GPUs. + To be precise, this PR offloads the Singular Value Decomposition required for + Matrix Product State Simulation to Nvidia GPUs with the help of cuQuantum. + + While choosing for the backend for Matrix Product State simulation users can choose all as usual, but + this time they can choose the device as GPU. + + Example + + .. code-block:: python + + from qiskit_aer import AerSimulator + from qiskit.circuit import QuantumCircuit + from qiskit.compiler import transpile + + num_qubits = 10 + shots = 5 + + qc = QuantumCircuit(num_qubits) + qc.h(0) + + for control, target in zip(range(num_qubits-1), range(1, num_qubits)): + qc.cx(control, target) + + qc.measure_all() + + sim = AerSimulator(method="matrix_product_state", device="GPU") + qc_t = transpile(qc, backend=sim) + job = sim.run(qc_t, shots = shots) + + counts = job.result().get_counts() + counts + + + + + From e6f228820c0cf3f1327ef1c8a291082bd20e01f0 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Mon, 17 Jun 2024 11:42:36 +0000 Subject: [PATCH 17/27] refactor code --- ...s-svd-with-cuquantum-c0392854d1f373e0.yaml | 4 +- .../matrix_product_state.hpp | 5 ++ .../matrix_product_state_internal.cpp | 20 +++++-- .../matrix_product_state_internal.hpp | 24 +++++++- .../matrix_product_state_tensor.hpp | 14 ++++- src/simulators/matrix_product_state/svd.cpp | 60 ++----------------- src/simulators/matrix_product_state/svd.hpp | 43 ++++++++++--- 7 files changed, 98 insertions(+), 72 deletions(-) diff --git a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml index 
ad5d08cf6e..8e065ad484 100644 --- a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -5,8 +5,8 @@ features: To be precise, this PR offloads the Singular Value Decomposition required for Matrix Product State Simulation to Nvidia GPUs with the help of cuQuantum. - While choosing for the backend for Matrix Product State simulation users can choose all as usual, but - this time they can choose the device as GPU. + While choosing for the backend for Matrix Product State simulation users can + choose all as usual, but this time they can choose the device as GPU. Example diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 676f3b9425..2f4b332395 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -376,6 +376,11 @@ void State::set_config(const Config &config) { // Set device for SVD MPS::set_mps_svd_device(config.device); + + // Get CUDA device, if GPU offloading enabled + if (config.device.compare("GPU") == 0) { + MPS::set_cuda_device(); + } } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index e02b3fec88..c301121960 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -48,6 +48,10 @@ bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; std::string MPS::mps_svd_device_; +// if cuda is present. 
+cutensornetHandle_t MPS::cuda_handle_ = NULL; +cudaStream_t MPS::cuda_stream_ = NULL; + //------------------------------------------------------------------------ // local function declarations //------------------------------------------------------------------------ @@ -665,9 +669,9 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = - MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, - MPS::mps_lapack_, MPS::mps_svd_device_); + double discarded_value = MPS_Tensor::Decompose( + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, + MPS::mps_svd_device_, MPS::cuda_stream_, MPS::cuda_handle_); if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1806,8 +1810,14 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // step 2 - SVD S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); - csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_, - MPS::mps_svd_device_); + + if (MPS::mps_svd_device_.compare("GPU") == 0) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, MPS::cuda_stream_, + MPS::cuda_handle_); + } else { + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); + } + reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 04eb08e7fc..71e067dcc5 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -82,7 +82,15 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} - ~MPS() {} + ~MPS() { + if (cuda_stream_) { + 
HANDLE_CUDA_ERROR(cudaStreamDestroy(cuda_stream_)); + } + + if (cuda_handle_) { + HANDLE_ERROR(cutensornetDestroy(cuda_handle_)); + } + } //-------------------------------------------------------------------------- // Function name: initialize @@ -325,6 +333,18 @@ class MPS { mps_svd_device_ = mps_svd_device; } + static void set_cuda_device() { + // the prop could be used to log the properties of the device. + + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); + + HANDLE_CUDA_ERROR(cudaStreamCreate(&MPS::cuda_stream_)); + HANDLE_ERROR(cutensornetCreate(&MPS::cuda_handle_)); + } + static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } static double get_json_chop_threshold() { return json_chop_threshold_; } @@ -574,6 +594,8 @@ class MPS { static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; static std::string mps_svd_device_; + static cutensornetHandle_t cuda_handle_; + static cudaStream_t cuda_stream_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index ed9d0bf1be..b57e7e0198 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -159,7 +159,9 @@ class MPS_Tensor { const MPS_Tensor &right_gamma, bool mul_by_lambda); static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device); + bool mps_lapack, std::string mps_svd_device, + cudaStream_t cuda_stream, + cutensornetHandle_t cuda_handle); static void reshape_for_3_qubits_before_SVD(const std::vector data, MPS_Tensor &reshaped_tensor); static void 
contract_2_dimensions(const MPS_Tensor &left_gamma, @@ -592,13 +594,19 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, //--------------------------------------------------------------- double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device) { + bool mps_lapack, std::string mps_svd_device, + cudaStream_t cuda_stream, + cutensornetHandle_t cuda_handle) { cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); - csvd_wrapper(C, U, S, V, mps_lapack, mps_svd_device); + if (mps_svd_device.compare("GPU") == 0) { + cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cuda_handle); + } else { + csvd_wrapper(C, U, S, V, mps_lapack); + } double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 0c8d26d7c9..820d8b29a2 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -551,18 +551,11 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { } void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack, std::string mps_svd_device) { - if (mps_svd_device.compare("GPU") == 0) { -#ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA - + bool lapack) { + if (lapack) { + lapack_csvd_wrapper(A, U, S, V); } else { - if (lapack) { - lapack_csvd_wrapper(A, U, S, V); - } else { - qiskit_csvd_wrapper(A, U, S, V); - } + qiskit_csvd_wrapper(A, U, S, V); } } @@ -676,35 +669,9 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } #ifdef AER_THRUST_CUDA - -#include -#include -#include -#include - -#define HANDLE_ERROR(x) \ - { \ - const auto err = x; \ - if (err != 
CUTENSORNET_STATUS_SUCCESS) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " \ - << cutensornetGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - -#define HANDLE_CUDA_ERROR(x) \ - { \ - const auto err = x; \ - if (err != cudaSuccess) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, - cmatrix_t &V) { + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle) { bool transposed = false; @@ -731,13 +698,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); - const size_t cuTensornetVersion = cutensornetGetVersion(); - - cudaDeviceProp prop; - int deviceId{-1}; - HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); - HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - cudaDataType_t typeData = CUDA_C_64F; std::vector modesA{'m', 'n'}; @@ -758,12 +718,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice)); - cudaStream_t stream; - HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); - - cutensornetHandle_t handle; - HANDLE_ERROR(cutensornetCreate(&handle)); - cutensornetTensorDescriptor_t descTensorA; cutensornetTensorDescriptor_t descTensorU; cutensornetTensorDescriptor_t descTensorV; @@ -851,8 +805,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); - HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); - HANDLE_ERROR(cutensornetDestroy(handle)); if (cutensor_S) free(cutensor_S); diff --git 
a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 9c44cfb83b..83cf769148 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -42,7 +42,7 @@ double reduce_zeros(cmatrix_t &U, rvector_t &S, cmatrix_t &V, status csvd(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); // Entry point for the SVD calculation void csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack, std::string mps_svd_device); + bool lapack); // Original qiskit call void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); @@ -50,17 +50,46 @@ void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, void lapack_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); -#ifdef AER_THRUST_CUDA -// cutensor call -void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V); -#endif // AER_THRUST_CUDA - void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); +#ifdef AER_THRUST_CUDA + +#include +#include +#include +#include + +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +// cutensor call +void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle); + +#endif // AER_THRUST_CUDA + 
//------------------------------------------------------------------------- } // end namespace AER //------------------------------------------------------------------------- From c24b9e2365ba281cd09c044862f3526b3817bac6 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 01:18:28 +0000 Subject: [PATCH 18/27] refactor code --- .../matrix_product_state_internal.cpp | 13 ++++--------- .../matrix_product_state_internal.hpp | 15 +-------------- .../matrix_product_state_tensor.hpp | 10 +++------- src/simulators/matrix_product_state/svd.cpp | 8 ++++++++ src/simulators/matrix_product_state/svd.hpp | 3 +-- 5 files changed, 17 insertions(+), 32 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index c301121960..9df0e3220c 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -48,10 +48,6 @@ bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; std::string MPS::mps_svd_device_; -// if cuda is present. 
-cutensornetHandle_t MPS::cuda_handle_ = NULL; -cudaStream_t MPS::cuda_stream_ = NULL; - //------------------------------------------------------------------------ // local function declarations //------------------------------------------------------------------------ @@ -669,9 +665,9 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = MPS_Tensor::Decompose( - temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, - MPS::mps_svd_device_, MPS::cuda_stream_, MPS::cuda_handle_); + double discarded_value = + MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, + MPS::mps_lapack_, MPS::mps_svd_device_); if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1812,8 +1808,7 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); if (MPS::mps_svd_device_.compare("GPU") == 0) { - cutensor_csvd_wrapper(reshaped_matrix, U, S, V, MPS::cuda_stream_, - MPS::cuda_handle_); + cutensor_csvd_wrapper(reshaped_matrix, U, S, V); } else { csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 71e067dcc5..b3d1433b19 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -82,15 +82,7 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} - ~MPS() { - if (cuda_stream_) { - HANDLE_CUDA_ERROR(cudaStreamDestroy(cuda_stream_)); - } - - if (cuda_handle_) { - HANDLE_ERROR(cutensornetDestroy(cuda_handle_)); - } - } + ~MPS() {} //-------------------------------------------------------------------------- // Function 
name: initialize @@ -340,9 +332,6 @@ class MPS { int deviceId{-1}; HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - - HANDLE_CUDA_ERROR(cudaStreamCreate(&MPS::cuda_stream_)); - HANDLE_ERROR(cutensornetCreate(&MPS::cuda_handle_)); } static uint_t get_omp_threads() { return omp_threads_; } @@ -594,8 +583,6 @@ class MPS { static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; static std::string mps_svd_device_; - static cutensornetHandle_t cuda_handle_; - static cudaStream_t cuda_stream_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index b57e7e0198..d8337f9aa6 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -159,9 +159,7 @@ class MPS_Tensor { const MPS_Tensor &right_gamma, bool mul_by_lambda); static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t cuda_stream, - cutensornetHandle_t cuda_handle); + bool mps_lapack, std::string mps_svd_device); static void reshape_for_3_qubits_before_SVD(const std::vector data, MPS_Tensor &reshaped_tensor); static void contract_2_dimensions(const MPS_Tensor &left_gamma, @@ -594,16 +592,14 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, //--------------------------------------------------------------- double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t cuda_stream, - cutensornetHandle_t cuda_handle) { + bool mps_lapack, std::string mps_svd_device) { cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; 
rvector_t S(std::min(C.GetRows(), C.GetColumns())); if (mps_svd_device.compare("GPU") == 0) { - cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cuda_handle); + cutensor_csvd_wrapper(C, U, S, V); } else { csvd_wrapper(C, U, S, V, mps_lapack); } diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 820d8b29a2..36bad34d54 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -704,6 +704,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector modesU{'m', 'x'}; std::vector modesV{'x', 'n'}; + cudaStream_t stream; + HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); + + cutensornetHandle_t handle; + HANDLE_ERROR(cutensornetCreate(&handle)); + double *cutensor_S = (double *)malloc(sizeS); void *D_A; @@ -805,6 +811,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); + HANDLE_ERROR(cutensornetDestroy(handle)); if (cutensor_S) free(cutensor_S); diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 83cf769148..d116c89348 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -85,8 +85,7 @@ void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, // cutensor call void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V, cudaStream_t &stream, - cutensornetHandle_t &handle); + cmatrix_t &V); #endif // AER_THRUST_CUDA From 34e9502f8275302a8e2eabb663b6c098c6a17e0b Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 02:21:50 +0000 Subject: [PATCH 19/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 3 +-- 1 file changed, 1 
insertion(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 36bad34d54..33ad8a2c57 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -670,8 +670,7 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, #ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, - cmatrix_t &V, cudaStream_t &stream, - cutensornetHandle_t &handle) { + cmatrix_t &V) { bool transposed = false; From 00f88e980fa5dabb8b0df88d944c1a549e6254cb Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 04:32:03 +0000 Subject: [PATCH 20/27] refactor code; included test --- .../matrix_product_state/matrix_product_state_internal.cpp | 2 ++ .../matrix_product_state/matrix_product_state_internal.hpp | 2 ++ .../matrix_product_state/matrix_product_state_tensor.hpp | 2 ++ src/simulators/matrix_product_state/svd.cpp | 3 ++- test/terra/backends/simulator_test_case.py | 2 +- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index 9df0e3220c..813f0236e9 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -1808,7 +1808,9 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); if (MPS::mps_svd_device_.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA cutensor_csvd_wrapper(reshaped_matrix, U, S, V); +#endif // AER_THRUST_CUDA } else { csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 
b3d1433b19..7050eb44f8 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -328,10 +328,12 @@ class MPS { static void set_cuda_device() { // the prop could be used to log the properties of the device. +#ifdef AER_THRUST_CUDA cudaDeviceProp prop; int deviceId{-1}; HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); +#endif // AER_THRUST_CUDA } static uint_t get_omp_threads() { return omp_threads_; } diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index d8337f9aa6..41b78cafee 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -599,7 +599,9 @@ double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t S(std::min(C.GetRows(), C.GetColumns())); if (mps_svd_device.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA cutensor_csvd_wrapper(C, U, S, V); +#endif // AER_THRUST_CUDA } else { csvd_wrapper(C, U, S, V, mps_lapack); } diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 33ad8a2c57..981506d351 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -797,7 +797,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U); V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - validate_SVdD_result(A_cpy, U, S, V); + V = AER::Utils::dagger(V); + validate_SVD_result(A_cpy, U, S, V); if (transposed) { std::swap(U, V); } diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 2173c2c413..1362b8a861 100644 --- 
a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -85,7 +85,7 @@ def _method_device(methods): # add special test device for cuStateVec if available cuStateVec = check_cuStateVec(available_devices) - gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network"] + gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network", "matrix_product_state"] batchable_methods = ["statevector", "density_matrix"] data_args = [] for method in methods: From 454f8c0075f329c46f2e3cf6542b64410d014c6b Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 04:35:32 +0000 Subject: [PATCH 21/27] lint --- test/terra/backends/simulator_test_case.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 1362b8a861..6c38c623c6 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -85,7 +85,13 @@ def _method_device(methods): # add special test device for cuStateVec if available cuStateVec = check_cuStateVec(available_devices) - gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network", "matrix_product_state"] + gpu_methods = [ + "statevector", + "density_matrix", + "unitary", + "tensor_network", + "matrix_product_state", + ] batchable_methods = ["statevector", "density_matrix"] data_args = [] for method in methods: From 985c7f20935399ffa8a643731a36e1654f9b8343 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 09:08:30 +0000 Subject: [PATCH 22/27] added suggestion --- test/terra/backends/simulator_test_case.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 6c38c623c6..358fe942f7 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -111,7 +111,11 
@@ def _method_device(methods): # add test cases for cuStateVec if available using special device = 'GPU_cuStateVec' #'GPU_cuStateVec' is used only inside tests not available in Aer # and this is converted to "device='GPU'" and option "cuStateVec_enalbe = True" is added - if cuStateVec and "tensor_network" != method: + if ( + cuStateVec + and "tensor_network" != method + and "matrix_product_state" != method + ): data_args.append((method, "GPU_cuStateVec")) else: data_args.append((method, "CPU")) From 34a5e75f105fb7ece277cec982ff95bbba560be0 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sat, 31 Aug 2024 13:48:32 +0000 Subject: [PATCH 23/27] fixed a typo --- .../matrix_product_state/matrix_product_state_tensor.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 5b4a77e78c..256392e7df 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -160,9 +160,10 @@ class MPS_Tensor { static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, bool mps_lapack, std::string mps_svd_device); - - static void reshape_for_3_qubits_before_SVD(const std::vector data, - MPS_Tensor &reshaped_tensor); + + static void + reshape_for_3_qubits_before_SVD(const std::vector &data, + MPS_Tensor &reshaped_tensor); static void contract_2_dimensions(const MPS_Tensor &left_gamma, const MPS_Tensor &right_gamma, From a1ae3080fafaa1655a35aa1bf44303eaacd29306 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 10 Sep 2024 23:06:26 +0000 Subject: [PATCH 24/27] refactor code --- .../matrix_product_state.hpp | 7 ++++- .../matrix_product_state_internal.cpp | 22 ++++++++++---- .../matrix_product_state_internal.hpp | 25 ++++++++-------- .../matrix_product_state_tensor.hpp | 30 
+++++++++++++++---- src/simulators/matrix_product_state/svd.cpp | 11 ++----- src/simulators/matrix_product_state/svd.hpp | 3 +- 6 files changed, 64 insertions(+), 34 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 0346b01f84..ae4f3bddd5 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -366,7 +366,12 @@ void State::set_config(const Config &config) { // Get CUDA device, if GPU offloading enabled if (config.device.compare("GPU") == 0) { - MPS::set_cuda_device(); +#ifdef AER_THRUST_CUDA + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); +#endif // AER_THRUST_CUDA } } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index a7d5a121ec..c48a4d4c24 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -46,7 +46,9 @@ double MPS::json_chop_threshold_ = 1E-8; std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; +#ifdef AER_THRUST_CUDA std::string MPS::mps_svd_device_; +#endif // AER_THRUST_CUDA //------------------------------------------------------------------------ // local function declarations @@ -665,9 +667,14 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = - MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, - MPS::mps_lapack_, MPS::mps_svd_device_); +#ifdef AER_THRUST_CUDA + double discarded_value = MPS_Tensor::Decompose( + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, + MPS::mps_svd_device_, cuda_stream, cutensor_handle); 
+#else + double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, + right_gamma, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1807,13 +1814,16 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); - if (MPS::mps_svd_device_.compare("GPU") == 0) { #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(reshaped_matrix, U, S, V); -#endif // AER_THRUST_CUDA + if (MPS::mps_svd_device_.compare("GPU") == 0) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream, + cutensor_handle); } else { csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); } +#else + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 7050eb44f8..599604fa8b 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -81,7 +81,14 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: - MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} + MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) { +#ifdef AER_THRUST_CUDA + if (mps_svd_device_.compare("GPU") == 0) { + cudaStreamCreate(&cuda_stream); + cutensornetCreate(&cutensor_handle); + } +#endif // AER_THRUST_CUDA + } ~MPS() {} //-------------------------------------------------------------------------- @@ -325,17 +332,6 @@ class MPS { mps_svd_device_ = mps_svd_device; } - static void set_cuda_device() { - // the prop could be used to log the properties of the device. 
- -#ifdef AER_THRUST_CUDA - cudaDeviceProp prop; - int deviceId{-1}; - HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); - HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); -#endif // AER_THRUST_CUDA - } - static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } static double get_json_chop_threshold() { return json_chop_threshold_; } @@ -558,6 +554,11 @@ class MPS { std::vector q_reg_; std::vector lambda_reg_; +#ifdef AER_THRUST_CUDA + cudaStream_t cuda_stream; + cutensornetHandle_t cutensor_handle; +#endif // AER_THRUST_CUDA + struct ordering { // order_ stores the current ordering of the qubits, // location_ stores the location of each qubit in the vector. It is derived diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 256392e7df..2886333b34 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -157,9 +157,17 @@ class MPS_Tensor { static MPS_Tensor contract(const MPS_Tensor &left_gamma, const rvector_t &lambda, const MPS_Tensor &right_gamma, bool mul_by_lambda); +#ifdef AER_THRUST_CUDA + static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle); +#else static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device); + bool mps_lapack); +#endif // AER_THRUST_CUDA static void reshape_for_3_qubits_before_SVD(const std::vector &data, @@ -593,21 +601,33 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, // rvector_t &lambda - tensors for the result. // Returns: none. 
//--------------------------------------------------------------- +#ifdef AER_THRUST_CUDA +double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle) +#else double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device) { + bool mps_lapack) +#endif // AER_THRUST_CUDA +{ cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); - if (mps_svd_device.compare("GPU") == 0) { #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(C, U, S, V); -#endif // AER_THRUST_CUDA + if (mps_svd_device.compare("GPU") == 0) { + cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle); } else { csvd_wrapper(C, U, S, V, mps_lapack); } +#else + csvd_wrapper(C, U, S, V, mps_lapack); +#endif // AER_THRUST_CUDA + double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 8ba5f45af6..38c74b16a0 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -670,7 +670,8 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, #ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, - cmatrix_t &V) { + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle) { bool transposed = false; @@ -703,12 +704,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector modesU{'m', 'x'}; std::vector modesV{'x', 'n'}; - cudaStream_t stream; - HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); - - cutensornetHandle_t handle; - HANDLE_ERROR(cutensornetCreate(&handle)); - double 
*cutensor_S = (double *)malloc(sizeS); void *D_A; @@ -811,8 +806,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); - HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); - HANDLE_ERROR(cutensornetDestroy(handle)); if (cutensor_S) free(cutensor_S); diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 9594d597f1..69e2518462 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -85,7 +85,8 @@ void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, // cutensor call void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V); + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle); #endif // AER_THRUST_CUDA From ed9a907c6c34688564cbaec1f0a96c3c0f99dfe8 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Mon, 11 Nov 2024 11:30:20 +0000 Subject: [PATCH 25/27] refactor code --- .../matrix_product_state/matrix_product_state.hpp | 10 +--------- .../matrix_product_state_internal.hpp | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index ae4f3bddd5..9de6b405ae 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -362,17 +362,9 @@ void State::set_config(const Config &config) { MPS::set_mps_lapack_svd(config.mps_lapack); // Set device for SVD - MPS::set_mps_svd_device(config.device); - - // Get CUDA device, if GPU offloading enabled - if (config.device.compare("GPU") == 0) { #ifdef AER_THRUST_CUDA - cudaDeviceProp prop; - int deviceId{-1}; - 
HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); - HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); + MPS::set_mps_svd_device(config.device); #endif // AER_THRUST_CUDA - } } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 599604fa8b..548fe31611 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -83,13 +83,22 @@ class MPS { public: MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) { #ifdef AER_THRUST_CUDA + cuda_stream = NULL; + cutensor_handle = NULL; if (mps_svd_device_.compare("GPU") == 0) { cudaStreamCreate(&cuda_stream); cutensornetCreate(&cutensor_handle); } #endif // AER_THRUST_CUDA } - ~MPS() {} + ~MPS() { +#ifdef AER_THRUST_CUDA + if (cutensor_handle) + cutensornetDestroy(cutensor_handle); + if (cuda_stream) + cudaStreamDestroy(cuda_stream); +#endif // AER_THRUST_CUDA + } //-------------------------------------------------------------------------- // Function name: initialize @@ -328,9 +337,11 @@ class MPS { } static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } +#ifdef AER_THRUST_CUDA static void set_mps_svd_device(std::string mps_svd_device) { mps_svd_device_ = mps_svd_device; } +#endif // AER_THRUST_CUDA static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } @@ -585,7 +596,9 @@ class MPS { static bool mps_log_data_; static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; +#ifdef AER_THRUST_CUDA static std::string mps_svd_device_; +#endif // AER_THRUST_CUDA }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { From e1e80d1effad2d86eea052c5b9324e9eeae8a0e9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 28 Nov 2024 06:43:03 +0000 Subject: 
[PATCH 26/27] added cublas for contract --- .../matrix_product_state.hpp | 2 +- .../matrix_product_state_internal.cpp | 37 +++- .../matrix_product_state_internal.hpp | 14 +- .../matrix_product_state_tensor.hpp | 167 +++++++++++++++++- src/simulators/matrix_product_state/svd.cpp | 1 - 5 files changed, 202 insertions(+), 19 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 9de6b405ae..1e4aafc1aa 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -363,7 +363,7 @@ void State::set_config(const Config &config) { // Set device for SVD #ifdef AER_THRUST_CUDA - MPS::set_mps_svd_device(config.device); + MPS::set_mps_device(config.device); #endif // AER_THRUST_CUDA } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index c48a4d4c24..8df59d349c 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -47,7 +47,7 @@ std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; #ifdef AER_THRUST_CUDA -std::string MPS::mps_svd_device_; +std::string MPS::mps_device_; #endif // AER_THRUST_CUDA //------------------------------------------------------------------------ @@ -631,8 +631,14 @@ void MPS::common_apply_2_qubit_gate( if (A + 1 != num_qubits_ - 1) q_reg_[A + 1].mul_Gamma_by_right_Lambda(lambda_reg_[A + 1]); +#ifdef AER_THRUST_CUDA + MPS_Tensor temp = + MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1], + MPS::mps_device_, num_qubits_, cublas_handle); +#else MPS_Tensor temp = MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1]); +#endif // AER_THRUST_CUDA switch (gate_type) { case cx: @@ -669,8 +675,8 @@ void 
MPS::common_apply_2_qubit_gate( rvector_t lambda; #ifdef AER_THRUST_CUDA double discarded_value = MPS_Tensor::Decompose( - temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, - MPS::mps_svd_device_, cuda_stream, cutensor_handle); + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, MPS::mps_device_, + num_qubits_, cuda_stream, cutensor_handle); #else double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_); @@ -1301,6 +1307,7 @@ MPS_Tensor MPS::state_vec_as_MPS(const reg_t &qubits) { } MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const { + MPS_Tensor temp = q_reg_[first_index]; if (first_index != 0) @@ -1313,8 +1320,14 @@ MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const { } for (uint_t i = first_index + 1; i < last_index + 1; i++) { +#ifdef AER_THRUST_CUDA + temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i], + MPS::mps_device_, num_qubits_, cublas_handle); +#else temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i]); +#endif // AER_THRUST_CUDA } + // now temp is a tensor of 2^n matrices if (last_index != num_qubits_ - 1) temp.mul_Gamma_by_right_Lambda(lambda_reg_[last_index]); @@ -1815,7 +1828,23 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); #ifdef AER_THRUST_CUDA - if (MPS::mps_svd_device_.compare("GPU") == 0) { + // Before offloading to the GPU, we need to make sure the size of + // matrices to be operated on should be big enough to make + // the time taken by the initialization of `cutensornet` worth it. + // Size of matrices involved for qubits < 13 is too small to be + // considered to offload to the GPU. 
+ // Even if num_qubits involved in the operation is > 13, still + // a lot of the matrices are too small to be offloaded, so to determine + // the size of matrices to offload, we can look at the number of elements + // it has. The number of elements `8401` and num_qubits > 13 has been + // obtained not on any theoritical basis, but on trial and error. + // The number of elements and number of qubits depends solely on + // difference between the speed of CPU and GPU involved. Even if matrices + // are big, they are not big enough to make speed of PICe a bottleneck. + // In this particular case CPU was `Zen+` and GPU was `NVIDIA Ampere`. + if ((num_qubits_ > 13) && (MPS::mps_device_.compare("GPU") == 0) && + ((reshaped_matrix.GetRows() * reshaped_matrix.GetColumns()) > 8401)) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream, cutensor_handle); } else { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 548fe31611..5275fc2df7 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -85,9 +85,12 @@ class MPS { #ifdef AER_THRUST_CUDA cuda_stream = NULL; cutensor_handle = NULL; - if (mps_svd_device_.compare("GPU") == 0) { + cublas_handle = NULL; + if (mps_device_.compare("GPU") == 0) { cudaStreamCreate(&cuda_stream); cutensornetCreate(&cutensor_handle); + cublasCreate(&cublas_handle); + cublasSetStream(cublas_handle, cuda_stream); } #endif // AER_THRUST_CUDA } @@ -95,6 +98,8 @@ class MPS { #ifdef AER_THRUST_CUDA if (cutensor_handle) cutensornetDestroy(cutensor_handle); + if (cublas_handle) + cublasDestroy(cublas_handle); if (cuda_stream) cudaStreamDestroy(cuda_stream); #endif // AER_THRUST_CUDA @@ -338,8 +343,8 @@ class MPS { static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } #ifdef AER_THRUST_CUDA - static void 
set_mps_svd_device(std::string mps_svd_device) { - mps_svd_device_ = mps_svd_device; + static void set_mps_device(std::string mps_device) { + mps_device_ = mps_device; } #endif // AER_THRUST_CUDA @@ -568,6 +573,7 @@ class MPS { #ifdef AER_THRUST_CUDA cudaStream_t cuda_stream; cutensornetHandle_t cutensor_handle; + cublasHandle_t cublas_handle; #endif // AER_THRUST_CUDA struct ordering { @@ -597,7 +603,7 @@ class MPS { static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; #ifdef AER_THRUST_CUDA - static std::string mps_svd_device_; + static std::string mps_device_; #endif // AER_THRUST_CUDA }; diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 2886333b34..31aae8e347 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -15,6 +15,19 @@ #ifndef _tensor_tensor_hpp_ #define _tensor_tensor_hpp_ +#ifdef AER_THRUST_CUDA +#include +#define HANDLE_CUBLAS_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + printf("Error: %s in line %d\n", cublasGetStatusString(err), __LINE__); \ + fflush(stdout); \ + } \ + }; + +#endif // AER_THRUST_CUDA + #include #include #include @@ -154,19 +167,33 @@ class MPS_Tensor { void mul_Gamma_by_right_Lambda(const rvector_t &Lambda); void div_Gamma_by_left_Lambda(const rvector_t &Lambda); void div_Gamma_by_right_Lambda(const rvector_t &Lambda); - static MPS_Tensor contract(const MPS_Tensor &left_gamma, - const rvector_t &lambda, - const MPS_Tensor &right_gamma, bool mul_by_lambda); + #ifdef AER_THRUST_CUDA static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t &cuda_stream, + bool mps_lapack, std::string mps_device, + uint_t num_qubits, cudaStream_t &cuda_stream, 
cutensornetHandle_t &cutensor_handle); + + static MPS_Tensor contract(const MPS_Tensor &left_gamma, + const rvector_t &lambda, + const MPS_Tensor &right_gamma, + std::string mps_device, uint_t num_qubits, + const cublasHandle_t &cublas_handle, + bool mul_by_lambda); + + static void cublas_Zgemm_(cmatrix_t &A, cmatrix_t &B, cmatrix_t &C, + const cublasHandle_t &cublas_handle); + #else static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, bool mps_lapack); + + static MPS_Tensor contract(const MPS_Tensor &left_gamma, + const rvector_t &lambda, + const MPS_Tensor &right_gamma, bool mul_by_lambda); + #endif // AER_THRUST_CUDA static void @@ -503,6 +530,65 @@ void MPS_Tensor::mul_Gamma_by_Lambda(const rvector_t &Lambda, // tensors to contract. // Returns: The result tensor of the contract //--------------------------------------------------------------- +#ifdef AER_THRUST_CUDA +MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, + const rvector_t &lambda, + const MPS_Tensor &right_gamma, + std::string mps_device, uint_t num_qubits, + const cublasHandle_t &cublas_handle, + bool mul_by_lambda = true) { + MPS_Tensor Res; + MPS_Tensor new_left = left_gamma; + if (mul_by_lambda) { + new_left.mul_Gamma_by_right_Lambda(lambda); + } + + for (uint_t idx1 = 0; idx1 < new_left.data_.size(); idx1++) { + for (uint_t idx2 = 0; idx2 < right_gamma.data_.size(); idx2++) { + + // Before offloading to the GPU, we need to make sure the size of + // matrices to be operated on should be big enough to make the time + // taken by the initialization of involved GPU routines worth it. + // Size of matrices involved for qubits < 16 is too small to be + // considered to offload to the GPU. + // Even if num_qubits involved in the operation is > 16, still + // a lot of the matrices are too small to be offloaded. 
+ // Matrices with rows/cols > 128 when offloaded to GPU for multiplication + // show speed up when compared to the operation done on the CPU. + // This observation was obtained on trial and error, and, does not have + // any theoritical basis. + // The number of rows/cols and number of qubits depends solely on + // difference between the speed of CPU and GPU involved. Even if matrices + // are big, they are not big enough to make speed of PICe a bottleneck. + // In this particular case CPU was `Zen+` and GPU was `NVIDIA Ampere`. + + if ((num_qubits > 16) && (mps_device.compare("GPU") == 0)) { + cmatrix_t mat1 = new_left.data_[idx1], mat2 = right_gamma.data_[idx2]; + + uint_t mat1_rows = mat1.GetRows(), mat1_cols = mat1.GetColumns(), + mat2_rows = mat2.GetRows(), mat2_cols = mat2.GetColumns(); + + // If the mps_device is set to `GPU`, also need to make + // sure if the matrix is large enough to make it worth + // invoking needed GPU routines. + // If the matrix is not big enough, the multiplication + // will be done on CPU using openblas zgemm_ routine. 
+ if ((mat1_rows > 128) && (mat1_cols > 128) && (mat2_cols > 128)) { + cmatrix_t mat1_mat2(mat1_rows, mat2_cols); + cublas_Zgemm_(mat1, mat2, mat1_mat2, cublas_handle); + Res.data_.push_back(mat1_mat2); + } else { + Res.data_.push_back(mat1 * mat2); + } + } else { + Res.data_.push_back(new_left.data_[idx1] * right_gamma.data_[idx2]); + } + } + } + + return Res; +} +#else MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, const rvector_t &lambda, const MPS_Tensor &right_gamma, @@ -514,11 +600,11 @@ MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, } for (uint_t i = 0; i < new_left.data_.size(); i++) for (uint_t j = 0; j < right_gamma.data_.size(); j++) { - Res.data_.push_back(new_left.data_[i] * right_gamma.data_[j]); } return Res; } +#endif //--------------------------------------------------------------- // Function name: contract_2_dimensions @@ -604,8 +690,8 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, #ifdef AER_THRUST_CUDA double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t &cuda_stream, + bool mps_lapack, std::string mps_device, + uint_t num_qubits, cudaStream_t &cuda_stream, cutensornetHandle_t &cutensor_handle) #else double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, @@ -619,7 +705,22 @@ double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t S(std::min(C.GetRows(), C.GetColumns())); #ifdef AER_THRUST_CUDA - if (mps_svd_device.compare("GPU") == 0) { + // Before offloading to the GPU, we need to make sure the size of + // matrices to be operated on should be big enough to make + // the time taken by the initialization of `cutensornet` worth it. + // Size of matrices involved for qubits < 13 is too small to be + // considered to offload to the GPU. 
+ // Even if num_qubits involved in the operation is > 13, still + // a lot of the matrices are too small to be offloaded, so to determine + // the size of matrices to offload, we can look at the number of elements + // it has. The number of elements `8401` and num_qubits > 13 has been + // obtained not on any theoritical basis, but on trial and error. + // The number of elements and number of qubits depends solely on + // difference between the speed of CPU and GPU involved. Even if matrices + // are big, they are not big enough to make speed of PICe a bottleneck. + // In this particular case CPU was `Zen+` and GPU was `NVIDIA Ampere`. + if ((num_qubits > 13) && (mps_device.compare("GPU") == 0) && + ((C.GetRows() * C.GetColumns()) > 8401)) { cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle); } else { csvd_wrapper(C, U, S, V, mps_lapack); @@ -660,6 +761,54 @@ void MPS_Tensor::reshape_for_3_qubits_before_SVD( reshaped_tensor = MPS_Tensor(new_data_vector); } +#ifdef AER_THRUST_CUDA +void MPS_Tensor::cublas_Zgemm_(cmatrix_t &A, cmatrix_t &B, cmatrix_t &C, + const cublasHandle_t &cublas_handle) { + size_t A_rows = A.GetRows(), A_cols = A.GetColumns(), B_rows = B.GetRows(), + B_cols = B.GetColumns(), + size_cuDoubleComplex = sizeof(cuDoubleComplex); + + size_t size_A = A_rows * A_cols * size_cuDoubleComplex, + size_B = B_rows * B_cols * size_cuDoubleComplex, + size_C = A_rows * B_cols * size_cuDoubleComplex; + + cuDoubleComplex *A_data_dev, *B_data_dev, *C_data_dev; + + HANDLE_CUDA_ERROR(cudaMalloc((void **)&A_data_dev, size_A)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&B_data_dev, size_B)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&C_data_dev, size_C)); + + complex_t *A_data = A.move_to_buffer(), *B_data = B.move_to_buffer(); + + HANDLE_CUDA_ERROR( + cudaMemcpy(A_data_dev, A_data, size_A, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR( + cudaMemcpy(B_data_dev, B_data, size_B, cudaMemcpyHostToDevice)); + + A = std::move(cmatrix_t::move_from_buffer(A_rows, 
A_cols, A_data)); + B = std::move(cmatrix_t::move_from_buffer(B_rows, B_cols, B_data)); + + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), + beta = make_cuDoubleComplex(0.0, 0.0); + + cublasZgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, A_rows, B_cols, A_cols, + &alpha, reinterpret_cast(A_data_dev), + A_rows, reinterpret_cast(B_data_dev), + B_rows, &beta, C_data_dev, A_rows); + + complex_t *C_res = reinterpret_cast(malloc(size_C)); + HANDLE_CUDA_ERROR( + cudaMemcpy(C_res, C_data_dev, size_C, cudaMemcpyDeviceToHost)); + C = std::move(cmatrix_t::copy_from_buffer(A_rows, B_cols, C_res)); + + HANDLE_CUDA_ERROR(cudaFree(A_data_dev)); + HANDLE_CUDA_ERROR(cudaFree(B_data_dev)); + HANDLE_CUDA_ERROR(cudaFree(C_data_dev)); + if (C_res) + free(C_res); +} +#endif // AER_THRUST_CUDA + //------------------------------------------------------------------------- } // end namespace MatrixProductState //------------------------------------------------------------------------- diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 38c74b16a0..313fe417bf 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -672,7 +672,6 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, cudaStream_t &stream, cutensornetHandle_t &handle) { - bool transposed = false; const int64_t rows = A.GetRows(), cols = A.GetColumns(); From 321230c77932e336c72fa35f8a9c72168c08c3c4 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 3 Dec 2024 08:01:12 +0000 Subject: [PATCH 27/27] fixed typo --- .../mps-svd-with-cuquantum-c0392854d1f373e0.yaml | 12 +++++++----- .../matrix_product_state_internal.cpp | 2 +- .../matrix_product_state_tensor.hpp | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml 
b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml index 8e065ad484..17a7125fc9 100644 --- a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -2,11 +2,13 @@ features: - | This PR adds the ability to run Matrix Product State Simulation on Nvidia GPUs. - To be precise, this PR offloads the Singular Value Decomposition required for - Matrix Product State Simulation to Nvidia GPUs with the help of cuQuantum. - - While choosing for the backend for Matrix Product State simulation users can - choose all as usual, but this time they can choose the device as GPU. + + While choosing for the backend for Matrix Product State simulation users can + choose all as usual, but this time when they choose the device as GPU, the + simulation is accelerated by NVIDIA GPU. To be precise, this PR offloads the + Singular Value Decomposition and matrix multiplication of bigger matrices involved + in Matrix Product State Simulation to GPU. + Example diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index 8df59d349c..abb98f551a 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -1837,7 +1837,7 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // a lot of the matrices are too small to be offloaded, so to determine // the size of matrices to offload, we can look at the number of elements // it has. The number of elements `8401` and num_qubits > 13 has been - // obtained not on any theoritical basis, but on trial and error. + // obtained not on any theoretical basis, but on trial and error. // The number of elements and number of qubits depends solely on // difference between the speed of CPU and GPU involved. 
Even if matrices // are big, they are not big enough to make speed of PICe a bottleneck. diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 31aae8e347..d81026c0e2 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -556,7 +556,7 @@ MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, // Matrices with rows/cols > 128 when offloaded to GPU for multiplication // show speed up when compared to the operation done on the CPU. // This observation was obtained on trial and error, and, does not have - // any theoritical basis. + // any theoretical basis. // The number of rows/cols and number of qubits depends solely on // difference between the speed of CPU and GPU involved. Even if matrices // are big, they are not big enough to make speed of PICe a bottleneck. @@ -714,7 +714,7 @@ double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, // a lot of the matrices are too small to be offloaded, so to determine // the size of matrices to offload, we can look at the number of elements // it has. The number of elements `8401` and num_qubits > 13 has been - // obtained not on any theoritical basis, but on trial and error. + // obtained not on any theoretical basis, but on trial and error. // The number of elements and number of qubits depends solely on // difference between the speed of CPU and GPU involved. Even if matrices // are big, they are not big enough to make speed of PICe a bottleneck.