From 563ae6ef14635c8f4777d7f7089f545520b86265 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Wed, 5 Jun 2024 22:35:13 +0000 Subject: [PATCH 01/27] initial layout --- src/simulators/matrix_product_state/svd.cpp | 277 +++++++++++++++++++- 1 file changed, 275 insertions(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 150d97af4f..7d886cae2e 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -28,6 +28,37 @@ #include namespace AER { + + +#ifdef AER_THRUST_CUDA + +#include +#include +#include +#include +#include + + +#define HANDLE_ERROR(x) \ +{ const auto err = x; \ +if( err != CUTENSORNET_STATUS_SUCCESS ) \ +{ printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); return err; } \ +}; + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \ +}; + + +#endif // AER_THRUST_CUDA + + + + + + // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -552,9 +583,23 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, bool lapack) { if (lapack) { - lapack_csvd_wrapper(A, U, S, V); + + #ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA + + #ifndef AER_THRUST_CUDA + lapack_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA } else { - qiskit_csvd_wrapper(A, U, S, V); + + #ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA + + #ifndef AER_THRUST_CUDA + qiskit_csvd_wrapper(A, U, S, V); + #endif // AER_THRUST_CUDA } } @@ -667,4 +712,232 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } +#ifdef AER_THRUST_CUDA +void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, 
cmatrix_t &V) +{ + + const size_t cuTensornetVersion = cutensornetGetVersion(); + + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) ); + HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); + + typedef float floatType; + cudaDataType_t typeData = CUDA_R_32F; + + std::vector modesT{'i', 'j'}; // input + std::vector modesU{'i', 'm'}; + std::vector modesV{'n', 'j'}; // SVD output + + + size_t elementsT = 160000; + size_t elementsU = 160000; + size_t elementsS = 400; + size_t elementsV = 160000; + + size_t sizeT = sizeof(floatType) * elementsT; + size_t sizeU = sizeof(floatType) * elementsU; + size_t sizeS = sizeof(floatType) * elementsS; + size_t sizeV = sizeof(floatType) * elementsS; + + floatType *T = (floatType*) malloc(sizeT); + floatType *U = (floatType*) malloc(sizeU); + floatType *S = (floatType*) malloc(sizeS); + floatType *V = (floatType*) malloc(sizeV); + + + void* D_T; + void* D_U; + void* D_S; + void* D_V; + + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_U, sizeU) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) ); + + cudaStream_t stream; + HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); + + cutensornetHandle_t handle; + HANDLE_ERROR( cutensornetCreate(&handle) ); + + cutensornetTensorDescriptor_t descTensorIn; + cutensornetTensorDescriptor_t descTensorU; + cutensornetTensorDescriptor_t descTensorV; + + const int32_t numModesIn = modesT.size(); + const int32_t numModesU = modesU.size(); + const int32_t numModesV = modesV.size(); + + std::vector extentT{400, 400}; // shape of T + std::vector extentU{400, 400}; // shape of U + std::vector extentS{400}; // shape of S + std::vector extentV{400, 400}; // shape of V + + const int64_t* strides = NULL; // assuming fortran layout for all tensors + + HANDLE_ERROR( 
cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + + cutensornetTensorSVDConfig_t svdConfig; + HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) ); + + // set up truncation parameters + double absCutoff = 1e-2; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, + &absCutoff, + sizeof(absCutoff)) ); + double relCutoff = 4e-2; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, + &relCutoff, + sizeof(relCutoff)) ); + + // optional: choose gesvdj algorithm with customized parameters. Default is gesvd. 
+ cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, + &svdAlgo, + sizeof(svdAlgo)) ); + cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, + &gesvdjParams, + sizeof(gesvdjParams)) ); + + /******************************************************** + * Create SVDInfo to record runtime SVD truncation details + *********************************************************/ + + cutensornetTensorSVDInfo_t svdInfo; + HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ; + + /************************************************************** + * Query the required workspace sizes and allocate memory + **************************************************************/ + + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) ); + HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) ); + int64_t hostWorkspaceSize, deviceWorkspaceSize; + // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick + HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &deviceWorkspaceSize) ); + HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, + &hostWorkspaceSize) ); + + void *devWork = nullptr, *hostWork = nullptr; + if (deviceWorkspaceSize > 0) { + HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) ); + } + if (hostWorkspaceSize > 0) { + hostWork = malloc(hostWorkspaceSize); + } + HANDLE_ERROR( 
cutensornetWorkspaceSetMemory(handle, + workDesc, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + devWork, + deviceWorkspaceSize) ); + HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, + workDesc, + CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, + hostWork, + hostWorkspaceSize) ); + + /********** + * Execution + ***********/ + + const int numRuns = 3; // to get stable perf results + for (int i=0; i < numRuns; ++i) + { + // restore output + cudaMemsetAsync(D_U, 0, sizeU, stream); + cudaMemsetAsync(D_S, 0, sizeS, stream); + cudaMemsetAsync(D_V, 0, sizeV, stream); + cudaDeviceSynchronize(); + + // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V. + // We here restore descTensorU/V to the original problem. + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + + HANDLE_ERROR( cutensornetTensorSVD(handle, + descTensorIn, D_T, + descTensorU, D_U, + D_S, + descTensorV, D_V, + svdConfig, + svdInfo, + workDesc, + stream) ); + // Synchronize and measure timing + } + + + HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) ); + + /************************************* + * Query runtime truncation information + **************************************/ + + double discardedWeight{0}; + int64_t reducedExtent{0}; + cutensornetGesvdjStatus_t gesvdjStatus; + cudaDeviceSynchronize(); // device synchronization. 
+ HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) ); + HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) ); + HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, sizeof(gesvdjStatus)) ); + + /*************** + * Free resources + ****************/ + + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) ); + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); + HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); + HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) ); + HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) ); + HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) ); + HANDLE_ERROR( cutensornetDestroy(handle) ); + + if (T) free(T); + if (U) free(U); + if (S) free(S); + if (V) free(V); + if (D_T) cudaFree(D_T); + if (D_U) cudaFree(D_U); + if (D_S) cudaFree(D_S); + if (D_V) cudaFree(D_V); + if (devWork) cudaFree(devWork); + if (hostWork) free(hostWork); + +} +#endif // AER_THRUST_CUDA + + } // namespace AER From 280f868a201cf9fa0c32dd978a6e630cb8888794 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Wed, 5 Jun 2024 23:05:05 +0000 Subject: [PATCH 02/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 57 ++++++++++----------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 7d886cae2e..ff16793cd6 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -39,17 +39,26 @@ namespace AER { #include -#define HANDLE_ERROR(x) \ -{ const auto err = x; \ -if( err != CUTENSORNET_STATUS_SUCCESS ) \ -{ printf("Error: %s in line %d\n", 
cutensornetGetErrorString(err), __LINE__); return err; } \ -}; - -#define HANDLE_CUDA_ERROR(x) \ -{ const auto err = x; \ - if( err != cudaSuccess ) \ - { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \ -}; +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; #endif // AER_THRUST_CUDA @@ -712,7 +721,7 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } -#ifdef AER_THRUST_CUDA +//#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { @@ -736,28 +745,22 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & size_t elementsS = 400; size_t elementsV = 160000; - size_t sizeT = sizeof(floatType) * elementsT; - size_t sizeU = sizeof(floatType) * elementsU; - size_t sizeS = sizeof(floatType) * elementsS; - size_t sizeV = sizeof(floatType) * elementsS; - - floatType *T = (floatType*) malloc(sizeT); - floatType *U = (floatType*) malloc(sizeU); - floatType *S = (floatType*) malloc(sizeS); - floatType *V = (floatType*) malloc(sizeV); - + size_t sizeA = sizeof(A); + size_t sizeU = sizeof(U); + size_t sizeS = sizeof(S); + size_t sizeV = sizeof(V); void* D_T; void* D_U; void* D_S; void* D_V; - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeA) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_U, sizeU) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); - HANDLE_CUDA_ERROR( 
cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) ); + HANDLE_CUDA_ERROR( cudaMemcpy(D_T, A, sizeA, cudaMemcpyHostToDevice) ); cudaStream_t stream; HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); @@ -925,10 +928,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) ); HANDLE_ERROR( cutensornetDestroy(handle) ); - if (T) free(T); - if (U) free(U); - if (S) free(S); - if (V) free(V); if (D_T) cudaFree(D_T); if (D_U) cudaFree(D_U); if (D_S) cudaFree(D_S); @@ -937,7 +936,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & if (hostWork) free(hostWork); } -#endif // AER_THRUST_CUDA +//#endif // AER_THRUST_CUDA } // namespace AER From ae44c69f2623cc0ade2563e6987ed652b23f9db9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 05:06:06 +0000 Subject: [PATCH 03/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 31 +++++++++++++++++---- src/simulators/matrix_product_state/svd.hpp | 3 ++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index ff16793cd6..7db0ef108d 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -724,8 +724,17 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, //#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { - - const size_t cuTensornetVersion = cutensornetGetVersion(); + const size_t m = A.GetRows(), n = A.GetColumns(); + const size_t min_dim = std::min(m, n); + const size_t lda = std::max(m, n); + size_t lwork = 2 * min_dim + lda; + + U.resize(m, m); + V.resize(n, n); + + complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); + + const size_t cuTensornetVersion = cutensornetGetVersion(); cudaDeviceProp prop; 
int deviceId{-1}; @@ -750,6 +759,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); + double *cutensor_S = (double*)malloc(sizeof(S)); + void* D_T; void* D_U; void* D_S; @@ -760,7 +771,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); - HANDLE_CUDA_ERROR( cudaMemcpy(D_T, A, sizeA, cudaMemcpyHostToDevice) ); + HANDLE_CUDA_ERROR( cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice) ); cudaStream_t stream; HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); @@ -900,9 +911,17 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & } - HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost) ); + HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost) ); + + S.clear(); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); + + A = cmatrix_t::move_from_buffer(m, n, cutensor_A); + U = cmatrix_t::move_from_buffer(m, m, cutensor_U); + V = cmatrix_t::move_from_buffer(n, n, cutensor_V); /************************************* * Query runtime truncation information diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index fac77797fb..c35a9c4406 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -49,6 +49,9 @@ void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, // Lapack call void lapack_csvd_wrapper(cmatrix_t &C, 
cmatrix_t &U, rvector_t &S, cmatrix_t &V); +// cutensor call +void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, + cmatrix_t &V); void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); From 5b4826551a5623ebc8dae11917b25e7806c50ab9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 05:43:10 +0000 Subject: [PATCH 04/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 28 ++++++++------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 7db0ef108d..2b73212f1f 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -724,10 +724,10 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, //#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { - const size_t m = A.GetRows(), n = A.GetColumns(); - const size_t min_dim = std::min(m, n); - const size_t lda = std::max(m, n); - size_t lwork = 2 * min_dim + lda; + const int64_t m = A.GetRows(), n = A.GetColumns(); + const int64_t min_dim = std::min(m, n); + const int64_t lda = std::max(m, n); + int64_t lwork = 2 * min_dim + lda; U.resize(m, m); V.resize(n, n); @@ -744,15 +744,9 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & typedef float floatType; cudaDataType_t typeData = CUDA_R_32F; - std::vector modesT{'i', 'j'}; // input - std::vector modesU{'i', 'm'}; - std::vector modesV{'n', 'j'}; // SVD output - - - size_t elementsT = 160000; - size_t elementsU = 160000; - size_t elementsS = 400; - size_t elementsV = 160000; + std::vector modesT{'m', 'n'}; // input + std::vector modesU{'m', 'm'}; + std::vector modesV{'n', 'n'}; // SVD output size_t sizeA = sizeof(A); size_t sizeU = sizeof(U); @@ -787,10 +781,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, 
rvector_t &S, cmatrix_t & const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentT{400, 400}; // shape of T - std::vector extentU{400, 400}; // shape of U - std::vector extentS{400}; // shape of S - std::vector extentV{400, 400}; // shape of V + std::vector extentT{m, n}; // shape of T + std::vector extentU{m, m}; // shape of U + std::vector extentS{n}; // shape of S + std::vector extentV{n, n}; // shape of V const int64_t* strides = NULL; // assuming fortran layout for all tensors From 517a5548541468e5a81889f1d8392097b9119a1e Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 06:21:25 +0000 Subject: [PATCH 05/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 2b73212f1f..1c60d708b7 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -742,11 +742,11 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); typedef float floatType; - cudaDataType_t typeData = CUDA_R_32F; + cudaDataType_t typeData = CUDA_C_64F; - std::vector modesT{'m', 'n'}; // input - std::vector modesU{'m', 'm'}; - std::vector modesV{'n', 'n'}; // SVD output + std::vector modesT{'a', 'b'}; // input + std::vector modesU{'c', 'd'}; + std::vector modesV{'e', 'c'}; // SVD output size_t sizeA = sizeof(A); size_t sizeU = sizeof(U); From 7e40588b4b8f21dbff6876702a6d7721609118aa Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 08:23:35 +0000 Subject: [PATCH 06/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 
1c60d708b7..4be447ea37 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -782,15 +782,18 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & const int32_t numModesV = modesV.size(); std::vector extentT{m, n}; // shape of T - std::vector extentU{m, m}; // shape of U - std::vector extentS{n}; // shape of S - std::vector extentV{n, n}; // shape of V + std::vector extentU{lda, lda}; // shape of U + std::vector extentS{lda}; // shape of S + std::vector extentV{min_dim, min_dim}; // shape of V - const int64_t* strides = NULL; // assuming fortran layout for all tensors + std::vector stridesT{n, 1}; + std::vector stridesU{lda, 1}; + std::vector stridesV{min_dim, 1}; - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), stridesT.data(), modesT.data(), typeData, &descTensorIn) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); cutensornetTensorSVDConfig_t svdConfig; HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) ); @@ -889,8 +892,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t & // We here restore descTensorU/V to the original problem. 
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); + HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); HANDLE_ERROR( cutensornetTensorSVD(handle, descTensorIn, D_T, From ebf9ca09e1ae4a9b3e704199705609e1cc17e7f9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 09:32:44 +0000 Subject: [PATCH 07/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 478 ++++++++++---------- 1 file changed, 237 insertions(+), 241 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 4be447ea37..6c7d432df5 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -29,15 +29,13 @@ namespace AER { - #ifdef AER_THRUST_CUDA -#include -#include +#include +#include +#include #include #include -#include - #define HANDLE_ERROR(x) \ { \ @@ -60,14 +58,8 @@ namespace AER { } \ }; - #endif // AER_THRUST_CUDA - - - - - // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -593,22 +585,22 @@ void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, bool lapack) { if (lapack) { - #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA - - #ifndef AER_THRUST_CUDA - lapack_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA +#ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); +#endif 
// AER_THRUST_CUDA + +#ifndef AER_THRUST_CUDA + lapack_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA } else { - #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA - - #ifndef AER_THRUST_CUDA - qiskit_csvd_wrapper(A, U, S, V); - #endif // AER_THRUST_CUDA +#ifdef AER_THRUST_CUDA + cutensor_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA + +#ifndef AER_THRUST_CUDA + qiskit_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA } } @@ -721,238 +713,242 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } -//#ifdef AER_THRUST_CUDA -void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) -{ - const int64_t m = A.GetRows(), n = A.GetColumns(); - const int64_t min_dim = std::min(m, n); - const int64_t lda = std::max(m, n); - int64_t lwork = 2 * min_dim + lda; - - U.resize(m, m); - V.resize(n, n); - - complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); - - const size_t cuTensornetVersion = cutensornetGetVersion(); - - cudaDeviceProp prop; - int deviceId{-1}; - HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) ); - HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); - - typedef float floatType; - cudaDataType_t typeData = CUDA_C_64F; - - std::vector modesT{'a', 'b'}; // input - std::vector modesU{'c', 'd'}; - std::vector modesV{'e', 'c'}; // SVD output - - size_t sizeA = sizeof(A); - size_t sizeU = sizeof(U); - size_t sizeS = sizeof(S); - size_t sizeV = sizeof(V); - - double *cutensor_S = (double*)malloc(sizeof(S)); - - void* D_T; - void* D_U; - void* D_S; - void* D_V; - - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeA) ); - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_U, sizeU) ); - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) ); - HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) ); - - HANDLE_CUDA_ERROR( cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice) ); - - cudaStream_t stream; - 
HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) ); - - cutensornetHandle_t handle; - HANDLE_ERROR( cutensornetCreate(&handle) ); - - cutensornetTensorDescriptor_t descTensorIn; - cutensornetTensorDescriptor_t descTensorU; - cutensornetTensorDescriptor_t descTensorV; - - const int32_t numModesIn = modesT.size(); - const int32_t numModesU = modesU.size(); - const int32_t numModesV = modesV.size(); - - std::vector extentT{m, n}; // shape of T - std::vector extentU{lda, lda}; // shape of U - std::vector extentS{lda}; // shape of S - std::vector extentV{min_dim, min_dim}; // shape of V - - std::vector stridesT{n, 1}; - std::vector stridesU{lda, 1}; - std::vector stridesV{min_dim, 1}; - - - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), stridesT.data(), modesT.data(), typeData, &descTensorIn) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); - - cutensornetTensorSVDConfig_t svdConfig; - HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) ); - - // set up truncation parameters - double absCutoff = 1e-2; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, - &absCutoff, - sizeof(absCutoff)) ); - double relCutoff = 4e-2; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, - &relCutoff, - sizeof(relCutoff)) ); - - // optional: choose gesvdj algorithm with customized parameters. Default is gesvd. 
- cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, - &svdAlgo, - sizeof(svdAlgo)) ); - cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; - HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, - svdConfig, - CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, - &gesvdjParams, - sizeof(gesvdjParams)) ); - - /******************************************************** +// #ifdef AER_THRUST_CUDA +void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, + cmatrix_t &V) { + const int64_t m = A.GetRows(), n = A.GetColumns(); + const int64_t min_dim = std::min(m, n); + const int64_t lda = std::max(m, n); + int64_t lwork = 2 * min_dim + lda; + + // Transpose when m < n + bool transposed = false; + if (m < n) { + transposed = true; + A = AER::Utils::dagger(A); + } + + complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), + *cutensor_V = V.move_to_buffer(); + + const size_t cuTensornetVersion = cutensornetGetVersion(); + + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); + + typedef float floatType; + cudaDataType_t typeData = CUDA_C_64F; + + std::vector modesA{'a', 'b'}; // input + std::vector modesU{'c', 'd'}; + std::vector modesV{'e', 'c'}; // SVD output + + size_t sizeA = sizeof(A); + size_t sizeU = sizeof(U); + size_t sizeS = sizeof(S); + size_t sizeV = sizeof(V); + + double *cutensor_S = (double *)malloc(sizeof(S)); + + void *D_T; + void *D_U; + void *D_S; + void *D_V; + + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_T, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV)); + + HANDLE_CUDA_ERROR(cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice)); + + 
cudaStream_t stream; + HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); + + cutensornetHandle_t handle; + HANDLE_ERROR(cutensornetCreate(&handle)); + + cutensornetTensorDescriptor_t descTensorIn; + cutensornetTensorDescriptor_t descTensorU; + cutensornetTensorDescriptor_t descTensorV; + + const int32_t numModesA = modesA.size(); + const int32_t numModesU = modesU.size(); + const int32_t numModesV = modesV.size(); + + std::vector extentA{lda, min_dim}; // shape of A + std::vector extentU{lda, lda}; // shape of U + std::vector extentS{min_dim}; // shape of S + std::vector extentV{min_dim, min_dim}; // shape of V + + std::vector stridesA{n, 1}; + std::vector stridesU{lda, 1}; + std::vector stridesV{min_dim, 1}; + + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesA, extentA.data(), stridesA.data(), modesA.data(), + typeData, &descTensorIn)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), + typeData, &descTensorU)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), + typeData, &descTensorV)); + + cutensornetTensorSVDConfig_t svdConfig; + HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig)); + + // set up truncation parameters + double absCutoff = 1e-2; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, &absCutoff, + sizeof(absCutoff))); + double relCutoff = 4e-2; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, &relCutoff, + sizeof(relCutoff))); + + // optional: choose gesvdj algorithm with customized parameters. Default is + // gesvd. 
+ cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, &svdAlgo, + sizeof(svdAlgo))); + cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; + HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( + handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, + &gesvdjParams, sizeof(gesvdjParams))); + + /******************************************************** * Create SVDInfo to record runtime SVD truncation details *********************************************************/ - cutensornetTensorSVDInfo_t svdInfo; - HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ; + cutensornetTensorSVDInfo_t svdInfo; + HANDLE_ERROR(cutensornetCreateTensorSVDInfo(handle, &svdInfo)); - /************************************************************** + /************************************************************** * Query the required workspace sizes and allocate memory **************************************************************/ - cutensornetWorkspaceDescriptor_t workDesc; - HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) ); - HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) ); - int64_t hostWorkspaceSize, deviceWorkspaceSize; - // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick - HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, - workDesc, - CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, - &deviceWorkspaceSize) ); - HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, - workDesc, - CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, - CUTENSORNET_MEMSPACE_HOST, - CUTENSORNET_WORKSPACE_SCRATCH, - &hostWorkspaceSize) ); - - void *devWork = nullptr, *hostWork = nullptr; - if (deviceWorkspaceSize > 0) { - HANDLE_CUDA_ERROR( cudaMalloc(&devWork, 
deviceWorkspaceSize) ); - } - if (hostWorkspaceSize > 0) { - hostWork = malloc(hostWorkspaceSize); - } - HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, - workDesc, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, - devWork, - deviceWorkspaceSize) ); - HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, - workDesc, - CUTENSORNET_MEMSPACE_HOST, - CUTENSORNET_WORKSPACE_SCRATCH, - hostWork, - hostWorkspaceSize) ); - - /********** + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); + HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( + handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc)); + int64_t hostWorkspaceSize, deviceWorkspaceSize; + // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, + &deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( + handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH, + &hostWorkspaceSize)); + + void *devWork = nullptr, *hostWork = nullptr; + if (deviceWorkspaceSize > 0) { + HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize)); + } + if (hostWorkspaceSize > 0) { + hostWork = malloc(hostWorkspaceSize); + } + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, devWork, deviceWorkspaceSize)); + HANDLE_ERROR(cutensornetWorkspaceSetMemory( + handle, workDesc, CUTENSORNET_MEMSPACE_HOST, + CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize)); + + /********** * Execution ***********/ - const int numRuns = 3; // to get stable perf results - for (int i=0; i < numRuns; ++i) - { - // restore output - cudaMemsetAsync(D_U, 0, sizeU, stream); - cudaMemsetAsync(D_S, 0, sizeS, stream); - 
cudaMemsetAsync(D_V, 0, sizeV, stream); - cudaDeviceSynchronize(); - - // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V. - // We here restore descTensorU/V to the original problem. - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU) ); - HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV) ); - - HANDLE_ERROR( cutensornetTensorSVD(handle, - descTensorIn, D_T, - descTensorU, D_U, - D_S, - descTensorV, D_V, - svdConfig, - svdInfo, - workDesc, - stream) ); - // Synchronize and measure timing - } - - - HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost) ); - HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost) ); - - S.clear(); - for (int i = 0; i < min_dim; i++) - S.push_back(cutensor_S[i]); - - A = cmatrix_t::move_from_buffer(m, n, cutensor_A); - U = cmatrix_t::move_from_buffer(m, m, cutensor_U); - V = cmatrix_t::move_from_buffer(n, n, cutensor_V); - - /************************************* + const int numRuns = 3; // to get stable perf results + for (int i = 0; i < numRuns; ++i) { + // restore output + cudaMemsetAsync(D_U, 0, sizeU, stream); + cudaMemsetAsync(D_S, 0, sizeS, stream); + cudaMemsetAsync(D_V, 0, sizeV, stream); + cudaDeviceSynchronize(); + + // With value-based truncation, `cutensornetTensorSVD` can potentially + // update the shared extent in descTensorU/V. We here restore descTensorU/V + // to the original problem. 
+ HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), + typeData, &descTensorU)); + HANDLE_ERROR(cutensornetCreateTensorDescriptor( + handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), + typeData, &descTensorV)); + + HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorIn, D_T, descTensorU, + D_U, D_S, descTensorV, D_V, svdConfig, + svdInfo, workDesc, stream)); + // Synchronize and measure timing + } + + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); + + S.clear(); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); + + A = cmatrix_t::move_from_buffer(m, n, cutensor_A); + U = cmatrix_t::move_from_buffer(m, m, cutensor_U); + V = cmatrix_t::move_from_buffer(n, n, cutensor_V); + + /************************************* * Query runtime truncation information **************************************/ - double discardedWeight{0}; - int64_t reducedExtent{0}; - cutensornetGesvdjStatus_t gesvdjStatus; - cudaDeviceSynchronize(); // device synchronization. 
- HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) ); - HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) ); - HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, sizeof(gesvdjStatus)) ); - - /*************** + double discardedWeight{0}; + int64_t reducedExtent{0}; + cutensornetGesvdjStatus_t gesvdjStatus; + cudaDeviceSynchronize(); // device synchronization. + HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( + handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, + &discardedWeight, sizeof(discardedWeight))); + HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( + handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, + &reducedExtent, sizeof(reducedExtent))); + HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( + handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, + sizeof(gesvdjStatus))); + + /*************** * Free resources ****************/ - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) ); - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) ); - HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) ); - HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) ); - HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) ); - HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) ); - HANDLE_ERROR( cutensornetDestroy(handle) ); - - if (D_T) cudaFree(D_T); - if (D_U) cudaFree(D_U); - if (D_S) cudaFree(D_S); - if (D_V) cudaFree(D_V); - if (devWork) cudaFree(devWork); - if (hostWork) free(hostWork); - + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorIn)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); + 
HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); + HANDLE_ERROR(cutensornetDestroyTensorSVDInfo(svdInfo)); + HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + HANDLE_ERROR(cutensornetDestroy(handle)); + + if (D_T) + cudaFree(D_T); + if (D_U) + cudaFree(D_U); + if (D_S) + cudaFree(D_S); + if (D_V) + cudaFree(D_V); + if (devWork) + cudaFree(devWork); + if (hostWork) + free(hostWork); } -//#endif // AER_THRUST_CUDA - +// #endif // AER_THRUST_CUDA } // namespace AER From a42269042f199a7e8b1ffa65e3b0ecfb0b57a28f Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 09:40:47 +0000 Subject: [PATCH 08/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 6c7d432df5..dcc515b0c2 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -770,7 +770,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cutensornetHandle_t handle; HANDLE_ERROR(cutensornetCreate(&handle)); - cutensornetTensorDescriptor_t descTensorIn; + cutensornetTensorDescriptor_t descTensorA; cutensornetTensorDescriptor_t descTensorU; cutensornetTensorDescriptor_t descTensorV; @@ -778,7 +778,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentA{lda, min_dim}; // shape of A + std::vector extentA{lda, min_dim}; // shape of A std::vector extentU{lda, lda}; // shape of U std::vector extentS{min_dim}; // shape of S std::vector extentV{min_dim, min_dim}; // shape of V @@ -789,7 +789,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetCreateTensorDescriptor( handle, numModesA, extentA.data(), stridesA.data(), modesA.data(), - typeData, &descTensorIn)); + 
typeData, &descTensorA)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), typeData, &descTensorU)); @@ -835,7 +835,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cutensornetWorkspaceDescriptor_t workDesc; HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( - handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc)); + handle, descTensorA, descTensorU, descTensorV, svdConfig, workDesc)); int64_t hostWorkspaceSize, deviceWorkspaceSize; // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( @@ -885,7 +885,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), typeData, &descTensorV)); - HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorIn, D_T, descTensorU, + HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_T, descTensorU, D_U, D_S, descTensorV, D_V, svdConfig, svdInfo, workDesc, stream)); // Synchronize and measure timing @@ -902,9 +902,9 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, for (int i = 0; i < min_dim; i++) S.push_back(cutensor_S[i]); - A = cmatrix_t::move_from_buffer(m, n, cutensor_A); - U = cmatrix_t::move_from_buffer(m, m, cutensor_U); - V = cmatrix_t::move_from_buffer(n, n, cutensor_V); + A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); + U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); + V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); /************************************* * Query runtime truncation information @@ -928,7 +928,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, * Free resources ****************/ - HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorIn)); + HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA)); 
HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); From 80b59d508c587c75811856fb9a12f4ba8ad6527d Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 6 Jun 2024 09:59:45 +0000 Subject: [PATCH 09/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index dcc515b0c2..7e4aa586ea 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -713,7 +713,7 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } } -// #ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { const int64_t m = A.GetRows(), n = A.GetColumns(); @@ -906,6 +906,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); + validate_SVD_result(A, U, S, V); + /************************************* * Query runtime truncation information **************************************/ @@ -949,6 +951,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, if (hostWork) free(hostWork); } -// #endif // AER_THRUST_CUDA +#endif // AER_THRUST_CUDA } // namespace AER From 52f1ed4f30a5736311b19ca0e18fe2da68662e47 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Fri, 7 Jun 2024 08:47:05 +0000 Subject: [PATCH 10/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 7e4aa586ea..b96f33c2c0 100644 --- a/src/simulators/matrix_product_state/svd.cpp 
+++ b/src/simulators/matrix_product_state/svd.cpp @@ -738,19 +738,19 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - typedef float floatType; + typedef double floatType; cudaDataType_t typeData = CUDA_C_64F; - std::vector modesA{'a', 'b'}; // input - std::vector modesU{'c', 'd'}; - std::vector modesV{'e', 'c'}; // SVD output + std::vector modesA{'m', 'n'}; // input + std::vector modesU{'n', 'x'}; + std::vector modesV{'x', 'n'}; // SVD output size_t sizeA = sizeof(A); size_t sizeU = sizeof(U); size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); - double *cutensor_S = (double *)malloc(sizeof(S)); + floatType *cutensor_S = (floatType *)malloc(sizeof(S)); void *D_T; void *D_U; @@ -779,12 +779,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesV = modesV.size(); std::vector extentA{lda, min_dim}; // shape of A - std::vector extentU{lda, lda}; // shape of U + std::vector extentU{min_dim, min_dim}; // shape of U std::vector extentS{min_dim}; // shape of S std::vector extentV{min_dim, min_dim}; // shape of V std::vector stridesA{n, 1}; - std::vector stridesU{lda, 1}; + std::vector stridesU{min_dim, 1}; std::vector stridesV{min_dim, 1}; HANDLE_ERROR(cutensornetCreateTensorDescriptor( @@ -888,7 +888,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_T, descTensorU, D_U, D_S, descTensorV, D_V, svdConfig, svdInfo, workDesc, stream)); - // Synchronize and measure timing } HANDLE_CUDA_ERROR( From ed43e7155d500906ba1b576dd8151529f00c059f Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sun, 9 Jun 2024 07:06:48 +0530 Subject: [PATCH 11/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 65 ++++++++++----------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git 
a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index b96f33c2c0..94fa5ab349 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -29,37 +29,6 @@ namespace AER { -#ifdef AER_THRUST_CUDA - -#include -#include -#include -#include -#include - -#define HANDLE_ERROR(x) \ - { \ - const auto err = x; \ - if (err != CUTENSORNET_STATUS_SUCCESS) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " \ - << cutensornetGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - -#define HANDLE_CUDA_ERROR(x) \ - { \ - const auto err = x; \ - if (err != cudaSuccess) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - -#endif // AER_THRUST_CUDA - // default values constexpr auto mul_factor = 1e2; constexpr long double tiny_factor = 1e30; @@ -714,6 +683,35 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } #ifdef AER_THRUST_CUDA + +#include +#include +#include +#include +#include + +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +namespace TensorNetwork { void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { const int64_t m = A.GetRows(), n = A.GetColumns(); @@ -750,7 +748,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); - floatType *cutensor_S = (floatType 
*)malloc(sizeof(S)); + std::complex *cutensor_S = (std::complex *)malloc(sizeof(S)); void *D_T; void *D_U; @@ -899,7 +897,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, S.clear(); for (int i = 0; i < min_dim; i++) - S.push_back(cutensor_S[i]); + S.push_back(cutensor_S[i].real()); A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); @@ -950,6 +948,7 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, if (hostWork) free(hostWork); } +} // namespace TensorNetwork #endif // AER_THRUST_CUDA } // namespace AER From 649a5d73b91618df515e6df0ff96f724072607d3 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sun, 9 Jun 2024 13:32:15 +0530 Subject: [PATCH 12/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 68 ++++++++------------- src/simulators/matrix_product_state/svd.hpp | 5 +- 2 files changed, 30 insertions(+), 43 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 94fa5ab349..2901e75d91 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -684,7 +684,6 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, #ifdef AER_THRUST_CUDA -#include #include #include #include @@ -717,7 +716,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int64_t m = A.GetRows(), n = A.GetColumns(); const int64_t min_dim = std::min(m, n); const int64_t lda = std::max(m, n); - int64_t lwork = 2 * min_dim + lda; // Transpose when m < n bool transposed = false; @@ -748,19 +746,19 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, size_t sizeS = sizeof(S); size_t sizeV = sizeof(V); - std::complex *cutensor_S = (std::complex *)malloc(sizeof(S)); + complex_t *cutensor_S = (complex_t *)malloc(sizeof(S)); - void *D_T; + void *D_A; void *D_U; void *D_S; void *D_V; - 
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_T, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_A, sizeA)); HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU)); HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS)); HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV)); - HANDLE_CUDA_ERROR(cudaMemcpy(D_T, cutensor_A, sizeA, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice)); cudaStream_t stream; HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); @@ -781,29 +779,28 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector extentS{min_dim}; // shape of S std::vector extentV{min_dim, min_dim}; // shape of V - std::vector stridesA{n, 1}; - std::vector stridesU{min_dim, 1}; - std::vector stridesV{min_dim, 1}; + const int64_t *strides = + NULL; // matrices stores the entries in column-major-order. HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesA, extentA.data(), stridesA.data(), modesA.data(), - typeData, &descTensorA)); + handle, numModesA, extentA.data(), strides, modesA.data(), typeData, + &descTensorA)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), - typeData, &descTensorU)); + handle, numModesU, extentU.data(), strides, modesU.data(), typeData, + &descTensorU)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), - typeData, &descTensorV)); + handle, numModesV, extentV.data(), strides, modesV.data(), typeData, + &descTensorV)); cutensornetTensorSVDConfig_t svdConfig; HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig)); // set up truncation parameters - double absCutoff = 1e-2; + double absCutoff = 1e-13; HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, &absCutoff, sizeof(absCutoff))); - double relCutoff = 4e-2; + double relCutoff = 4e-13; 
HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, &relCutoff, sizeof(relCutoff))); @@ -877,13 +874,13 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesU, extentU.data(), stridesU.data(), modesU.data(), - typeData, &descTensorU)); + handle, numModesU, extentU.data(), strides, modesU.data(), typeData, + &descTensorU)); HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesV, extentV.data(), stridesV.data(), modesV.data(), - typeData, &descTensorV)); + handle, numModesV, extentV.data(), strides, modesV.data(), typeData, + &descTensorV)); - HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_T, descTensorU, + HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U, D_S, descTensorV, D_V, svdConfig, svdInfo, workDesc, stream)); } @@ -903,25 +900,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - validate_SVD_result(A, U, S, V); + if (transposed) { + transposed = false; + A = AER::Utils::dagger(A); + } - /************************************* - * Query runtime truncation information - **************************************/ - - double discardedWeight{0}; - int64_t reducedExtent{0}; - cutensornetGesvdjStatus_t gesvdjStatus; - cudaDeviceSynchronize(); // device synchronization. 
- HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( - handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, - &discardedWeight, sizeof(discardedWeight))); - HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( - handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, - &reducedExtent, sizeof(reducedExtent))); - HANDLE_ERROR(cutensornetTensorSVDInfoGetAttribute( - handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, - sizeof(gesvdjStatus))); + validate_SVD_result(A, U, S, V); /*************** * Free resources @@ -935,8 +919,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); HANDLE_ERROR(cutensornetDestroy(handle)); - if (D_T) - cudaFree(D_T); + if (D_A) + cudaFree(D_A); if (D_U) cudaFree(D_U); if (D_S) diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index c35a9c4406..0f0715b480 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -49,9 +49,12 @@ void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, // Lapack call void lapack_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); + +#ifdef AER_THRUST_CUDA // cutensor call void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V); + cmatrix_t &V); +#endif // AER_THRUST_CUDA void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); From c33571fd51c7abf07271330d729956cc454bd7dd Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 11 Jun 2024 14:59:43 +0000 Subject: [PATCH 13/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 51 +++++++++------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index b022caa12a..2a5418dd51 100644 --- 
a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -710,19 +710,13 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } \ }; -namespace TensorNetwork { void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { - const int64_t m = A.GetRows(), n = A.GetColumns(); - const int64_t min_dim = std::min(m, n); - const int64_t lda = std::max(m, n); + cmatrix_t A_cpy = A; - // Transpose when m < n - bool transposed = false; - if (m < n) { - transposed = true; - A = AER::Utils::dagger(A); - } + const int64_t m = A.GetRows(), n = A.GetColumns(); + U.resize(n, n); + V.resize(n, n); complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); @@ -741,12 +735,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector modesU{'n', 'x'}; std::vector modesV{'x', 'n'}; // SVD output - size_t sizeA = sizeof(A); - size_t sizeU = sizeof(U); - size_t sizeS = sizeof(S); - size_t sizeV = sizeof(V); + size_t sizeA = A.size() * sizeof(cmatrix_t); + size_t sizeU = U.size() * sizeof(cmatrix_t); + size_t sizeS = S.size() * sizeof(complex_t); + size_t sizeV = V.size() * sizeof(cmatrix_t); - complex_t *cutensor_S = (complex_t *)malloc(sizeof(S)); + complex_t *cutensor_S = (complex_t *)malloc(sizeS); void *D_A; void *D_U; @@ -774,10 +768,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentA{lda, min_dim}; // shape of A - std::vector extentU{min_dim, min_dim}; // shape of U - std::vector extentS{min_dim}; // shape of S - std::vector extentV{min_dim, min_dim}; // shape of V + std::vector extentA{m, n}; // shape of A + std::vector extentU{n, n}; // shape of U :) + std::vector extentS{n}; // shape of S + std::vector extentV{n, n}; // shape of V const int64_t *strides = NULL; // matrices stores the entries in 
column-major-order. @@ -893,19 +887,14 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); S.clear(); - for (int i = 0; i < min_dim; i++) + for (int i = 0; i < n; i++) S.push_back(cutensor_S[i].real()); - A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); - U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U); - V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - - if (transposed) { - transposed = false; - A = AER::Utils::dagger(A); - } + A = cmatrix_t::move_from_buffer(m, n, cutensor_A); + U = cmatrix_t::move_from_buffer(n, n, cutensor_U); + V = cmatrix_t::move_from_buffer(n, n, cutensor_V); - validate_SVD_result(A, U, S, V); + validate_SVD_result(A_cpy, U, S, V); /*************** * Free resources @@ -917,8 +906,11 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); HANDLE_ERROR(cutensornetDestroyTensorSVDInfo(svdInfo)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); HANDLE_ERROR(cutensornetDestroy(handle)); + if (cutensor_S) + free(cutensor_S); if (D_A) cudaFree(D_A); if (D_U) @@ -932,7 +924,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, if (hostWork) free(hostWork); } -} // namespace TensorNetwork #endif // AER_THRUST_CUDA } // namespace AER From f0205e37f281e0f68da784685f0b5fed865458a5 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Fri, 14 Jun 2024 14:02:08 +0000 Subject: [PATCH 14/27] refactor code --- .../matrix_product_state.hpp | 3 +++ .../matrix_product_state_internal.cpp | 10 +++++--- .../matrix_product_state_internal.hpp | 8 +++++-- .../matrix_product_state_tensor.hpp | 6 ++--- src/simulators/matrix_product_state/svd.cpp | 24 ++++++------------- src/simulators/matrix_product_state/svd.hpp | 2 +- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git 
a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 1cfeb9b43d..676f3b9425 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -373,6 +373,9 @@ void State::set_config(const Config &config) { // Set LAPACK SVD MPS::set_mps_lapack_svd(config.mps_lapack); + + // Set device for SVD + MPS::set_mps_svd_device(config.device); } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index e6cacd0239..e02b3fec88 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -19,6 +19,7 @@ #include "stdlib.h" #include "string.h" #include +#include #include #include "framework/linalg/almost_equal.hpp" @@ -45,6 +46,7 @@ double MPS::json_chop_threshold_ = 1E-8; std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; +std::string MPS::mps_svd_device_; //------------------------------------------------------------------------ // local function declarations @@ -663,8 +665,9 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, - right_gamma, MPS::mps_lapack_); + double discarded_value = + MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, + MPS::mps_lapack_, MPS::mps_svd_device_); if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1803,7 +1806,8 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // step 2 - SVD S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); - 
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_, + MPS::mps_svd_device_); reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 1180e6cddf..04eb08e7fc 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -15,12 +15,12 @@ #ifndef _aer_matrix_product_state_hpp_ #define _aer_matrix_product_state_hpp_ -#include - #include "framework/json.hpp" #include "framework/operations.hpp" #include "framework/utils.hpp" #include "matrix_product_state_tensor.hpp" +#include +#include namespace AER { namespace MatrixProductState { @@ -321,6 +321,9 @@ class MPS { } static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } + static void set_mps_svd_device(std::string mps_svd_device) { + mps_svd_device_ = mps_svd_device; + } static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } @@ -570,6 +573,7 @@ class MPS { static bool mps_log_data_; static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; + static std::string mps_svd_device_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index b769e8a59f..ed9d0bf1be 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -159,7 +159,7 @@ class MPS_Tensor { const MPS_Tensor &right_gamma, bool mul_by_lambda); static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, 
rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack); + bool mps_lapack, std::string mps_svd_device); static void reshape_for_3_qubits_before_SVD(const std::vector data, MPS_Tensor &reshaped_tensor); static void contract_2_dimensions(const MPS_Tensor &left_gamma, @@ -592,13 +592,13 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, //--------------------------------------------------------------- double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack) { + bool mps_lapack, std::string mps_svd_device) { cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); - csvd_wrapper(C, U, S, V, mps_lapack); + csvd_wrapper(C, U, S, V, mps_lapack, mps_svd_device); double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 2a5418dd51..30d102459f 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -551,25 +551,15 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { } void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack) { - if (lapack) { - -#ifdef AER_THRUST_CUDA + bool lapack, std::string mps_svd_device) { + if (mps_svd_device.compare("GPU") == 0) { cutensor_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA - -#ifndef AER_THRUST_CUDA - lapack_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA } else { - -#ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA - -#ifndef AER_THRUST_CUDA - qiskit_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA + if (lapack) { + lapack_csvd_wrapper(A, U, S, V); + } else { + qiskit_csvd_wrapper(A, U, S, V); + } } } diff --git 
a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 0f0715b480..9c44cfb83b 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -42,7 +42,7 @@ double reduce_zeros(cmatrix_t &U, rvector_t &S, cmatrix_t &V, status csvd(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); // Entry point for the SVD calculation void csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack); + bool lapack, std::string mps_svd_device); // Original qiskit call void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); From 629f65f5c0262c7571dab724076a45525f212cc7 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sat, 15 Jun 2024 12:25:34 +0000 Subject: [PATCH 15/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 133 +++++++------------- 1 file changed, 44 insertions(+), 89 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 30d102459f..0c8d26d7c9 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -553,7 +553,10 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, bool lapack, std::string mps_svd_device) { if (mps_svd_device.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA cutensor_csvd_wrapper(A, U, S, V); +#endif // AER_THRUST_CUDA + } else { if (lapack) { lapack_csvd_wrapper(A, U, S, V); @@ -702,11 +705,28 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { + + bool transposed = false; + + const int64_t rows = A.GetRows(), cols = A.GetColumns(); + + if (rows < cols) { + transposed = true; + A = AER::Utils::dagger(A); + } cmatrix_t A_cpy = A; - const int64_t m = A.GetRows(), n = A.GetColumns(); - 
U.resize(n, n); - V.resize(n, n); + const int64_t min_dim = std::min(rows, cols); + const int64_t lda = std::max(rows, cols); + + U.resize(lda, min_dim); + V.resize(min_dim, min_dim); + S.resize(min_dim); + + size_t sizeA = A.size() * sizeof(complex_t); + size_t sizeU = U.size() * sizeof(complex_t); + size_t sizeS = S.size() * sizeof(double); + size_t sizeV = V.size() * sizeof(complex_t); complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); @@ -718,19 +738,13 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - typedef double floatType; cudaDataType_t typeData = CUDA_C_64F; - std::vector modesA{'m', 'n'}; // input - std::vector modesU{'n', 'x'}; - std::vector modesV{'x', 'n'}; // SVD output - - size_t sizeA = A.size() * sizeof(cmatrix_t); - size_t sizeU = U.size() * sizeof(cmatrix_t); - size_t sizeS = S.size() * sizeof(complex_t); - size_t sizeV = V.size() * sizeof(cmatrix_t); + std::vector modesA{'m', 'n'}; + std::vector modesU{'m', 'x'}; + std::vector modesV{'x', 'n'}; - complex_t *cutensor_S = (complex_t *)malloc(sizeS); + double *cutensor_S = (double *)malloc(sizeS); void *D_A; void *D_U; @@ -758,10 +772,9 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, const int32_t numModesU = modesU.size(); const int32_t numModesV = modesV.size(); - std::vector extentA{m, n}; // shape of A - std::vector extentU{n, n}; // shape of U :) - std::vector extentS{n}; // shape of S - std::vector extentV{n, n}; // shape of V + std::vector extentA{lda, min_dim}; // shape of A + std::vector extentU{lda, min_dim}; // shape of U :) + std::vector extentV{min_dim, min_dim}; // shape of V const int64_t *strides = NULL; // matrices stores the entries in column-major-order. 
@@ -776,45 +789,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV)); - cutensornetTensorSVDConfig_t svdConfig; - HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig)); - - // set up truncation parameters - double absCutoff = 1e-13; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, &absCutoff, - sizeof(absCutoff))); - double relCutoff = 4e-13; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, &relCutoff, - sizeof(relCutoff))); - - // optional: choose gesvdj algorithm with customized parameters. Default is - // gesvd. - cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, &svdAlgo, - sizeof(svdAlgo))); - cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; - HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute( - handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, - &gesvdjParams, sizeof(gesvdjParams))); - - /******************************************************** - * Create SVDInfo to record runtime SVD truncation details - *********************************************************/ - - cutensornetTensorSVDInfo_t svdInfo; - HANDLE_ERROR(cutensornetCreateTensorSVDInfo(handle, &svdInfo)); - - /************************************************************** - * Query the required workspace sizes and allocate memory - **************************************************************/ - cutensornetWorkspaceDescriptor_t workDesc; HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc)); HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes( - handle, descTensorA, descTensorU, descTensorV, svdConfig, workDesc)); + handle, descTensorA, descTensorU, descTensorV, 
NULL, workDesc)); int64_t hostWorkspaceSize, deviceWorkspaceSize; // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick HANDLE_ERROR(cutensornetWorkspaceGetMemorySize( @@ -840,34 +818,10 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, handle, workDesc, CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize)); - /********** - * Execution - ***********/ - - const int numRuns = 3; // to get stable perf results - for (int i = 0; i < numRuns; ++i) { - // restore output - cudaMemsetAsync(D_U, 0, sizeU, stream); - cudaMemsetAsync(D_S, 0, sizeS, stream); - cudaMemsetAsync(D_V, 0, sizeV, stream); - cudaDeviceSynchronize(); - - // With value-based truncation, `cutensornetTensorSVD` can potentially - // update the shared extent in descTensorU/V. We here restore descTensorU/V - // to the original problem. - HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); - HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); - HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesU, extentU.data(), strides, modesU.data(), typeData, - &descTensorU)); - HANDLE_ERROR(cutensornetCreateTensorDescriptor( - handle, numModesV, extentV.data(), strides, modesV.data(), typeData, - &descTensorV)); - - HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, - D_U, D_S, descTensorV, D_V, svdConfig, - svdInfo, workDesc, stream)); - } + // Requesting for Exact SVD. 
+ HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U, + D_S, descTensorV, D_V, NULL, NULL, workDesc, + stream)); HANDLE_CUDA_ERROR( cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost)); @@ -877,14 +831,17 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost)); S.clear(); - for (int i = 0; i < n; i++) - S.push_back(cutensor_S[i].real()); + for (int i = 0; i < min_dim; i++) + S.push_back(cutensor_S[i]); - A = cmatrix_t::move_from_buffer(m, n, cutensor_A); - U = cmatrix_t::move_from_buffer(n, n, cutensor_U); - V = cmatrix_t::move_from_buffer(n, n, cutensor_V); + A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A); + U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U); + V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - validate_SVD_result(A_cpy, U, S, V); + validate_SVdD_result(A_cpy, U, S, V); + if (transposed) { + std::swap(U, V); + } /*************** * Free resources @@ -893,8 +850,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); - HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig)); - HANDLE_ERROR(cutensornetDestroyTensorSVDInfo(svdInfo)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); HANDLE_ERROR(cutensornetDestroy(handle)); From 644a822ac86eb388d9f1b8d34a6424d9338deb4b Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sun, 16 Jun 2024 23:59:24 +0000 Subject: [PATCH 16/27] added release note --- ...s-svd-with-cuquantum-c0392854d1f373e0.yaml | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml diff --git 
a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml new file mode 100644 index 0000000000..ad5d08cf6e --- /dev/null +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -0,0 +1,40 @@ +--- +features: + - | + This PR adds the ability to run Matrix Product State Simulation on Nvidia GPUs. + To be precise, this PR offloads the Singular Value Decomposition required for + Matrix Product State Simulation to Nvidia GPUs with the help of cuQuantum. + + While choosing for the backend for Matrix Product State simulation users can choose all as usual, but + this time they can choose the device as GPU. + + Example + + .. code-block:: python + + from qiskit_aer import AerSimulator + from qiskit.circuit import QuantumCircuit + from qiskit.compiler import transpile + + num_qubits = 10 + shots = 5 + + qc = QuantumCircuit(num_qubits) + qc.h(0) + + for control, target in zip(range(num_qubits-1), range(1, num_qubits)): + qc.cx(control, target) + + qc.measure_all() + + sim = AerSimulator(method="matrix_product_state", device="GPU") + qc_t = transpile(qc, backend=sim) + job = sim.run(qc_t, shots = shots) + + counts = job.result().get_counts() + counts + + + + + From e6f228820c0cf3f1327ef1c8a291082bd20e01f0 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Mon, 17 Jun 2024 11:42:36 +0000 Subject: [PATCH 17/27] refactor code --- ...s-svd-with-cuquantum-c0392854d1f373e0.yaml | 4 +- .../matrix_product_state.hpp | 5 ++ .../matrix_product_state_internal.cpp | 20 +++++-- .../matrix_product_state_internal.hpp | 24 +++++++- .../matrix_product_state_tensor.hpp | 14 ++++- src/simulators/matrix_product_state/svd.cpp | 60 ++----------------- src/simulators/matrix_product_state/svd.hpp | 43 ++++++++++--- 7 files changed, 98 insertions(+), 72 deletions(-) diff --git a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml index 
ad5d08cf6e..8e065ad484 100644 --- a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -5,8 +5,8 @@ features: To be precise, this PR offloads the Singular Value Decomposition required for Matrix Product State Simulation to Nvidia GPUs with the help of cuQuantum. - While choosing for the backend for Matrix Product State simulation users can choose all as usual, but - this time they can choose the device as GPU. + While choosing for the backend for Matrix Product State simulation users can + choose all as usual, but this time they can choose the device as GPU. Example diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 676f3b9425..2f4b332395 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -376,6 +376,11 @@ void State::set_config(const Config &config) { // Set device for SVD MPS::set_mps_svd_device(config.device); + + // Get CUDA device, if GPU offloading enabled + if (config.device.compare("GPU") == 0) { + MPS::set_cuda_device(); + } } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index e02b3fec88..c301121960 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -48,6 +48,10 @@ bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; std::string MPS::mps_svd_device_; +// if cuda is present. 
+cutensornetHandle_t MPS::cuda_handle_ = NULL; +cudaStream_t MPS::cuda_stream_ = NULL; + //------------------------------------------------------------------------ // local function declarations //------------------------------------------------------------------------ @@ -665,9 +669,9 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = - MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, - MPS::mps_lapack_, MPS::mps_svd_device_); + double discarded_value = MPS_Tensor::Decompose( + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, + MPS::mps_svd_device_, MPS::cuda_stream_, MPS::cuda_handle_); if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1806,8 +1810,14 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // step 2 - SVD S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); - csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_, - MPS::mps_svd_device_); + + if (MPS::mps_svd_device_.compare("GPU") == 0) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, MPS::cuda_stream_, + MPS::cuda_handle_); + } else { + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); + } + reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 04eb08e7fc..71e067dcc5 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -82,7 +82,15 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} - ~MPS() {} + ~MPS() { + if (cuda_stream_) { + 
HANDLE_CUDA_ERROR(cudaStreamDestroy(cuda_stream_)); + } + + if (cuda_handle_) { + HANDLE_ERROR(cutensornetDestroy(cuda_handle_)); + } + } //-------------------------------------------------------------------------- // Function name: initialize @@ -325,6 +333,18 @@ class MPS { mps_svd_device_ = mps_svd_device; } + static void set_cuda_device() { + // the prop could be used to log the properties of the device. + + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); + + HANDLE_CUDA_ERROR(cudaStreamCreate(&MPS::cuda_stream_)); + HANDLE_ERROR(cutensornetCreate(&MPS::cuda_handle_)); + } + static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } static double get_json_chop_threshold() { return json_chop_threshold_; } @@ -574,6 +594,8 @@ class MPS { static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; static std::string mps_svd_device_; + static cutensornetHandle_t cuda_handle_; + static cudaStream_t cuda_stream_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index ed9d0bf1be..b57e7e0198 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -159,7 +159,9 @@ class MPS_Tensor { const MPS_Tensor &right_gamma, bool mul_by_lambda); static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device); + bool mps_lapack, std::string mps_svd_device, + cudaStream_t cuda_stream, + cutensornetHandle_t cuda_handle); static void reshape_for_3_qubits_before_SVD(const std::vector data, MPS_Tensor &reshaped_tensor); static void 
contract_2_dimensions(const MPS_Tensor &left_gamma, @@ -592,13 +594,19 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, //--------------------------------------------------------------- double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device) { + bool mps_lapack, std::string mps_svd_device, + cudaStream_t cuda_stream, + cutensornetHandle_t cuda_handle) { cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); - csvd_wrapper(C, U, S, V, mps_lapack, mps_svd_device); + if (mps_svd_device.compare("GPU") == 0) { + cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cuda_handle); + } else { + csvd_wrapper(C, U, S, V, mps_lapack); + } double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 0c8d26d7c9..820d8b29a2 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -551,18 +551,11 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) { } void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack, std::string mps_svd_device) { - if (mps_svd_device.compare("GPU") == 0) { -#ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(A, U, S, V); -#endif // AER_THRUST_CUDA - + bool lapack) { + if (lapack) { + lapack_csvd_wrapper(A, U, S, V); } else { - if (lapack) { - lapack_csvd_wrapper(A, U, S, V); - } else { - qiskit_csvd_wrapper(A, U, S, V); - } + qiskit_csvd_wrapper(A, U, S, V); } } @@ -676,35 +669,9 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, } #ifdef AER_THRUST_CUDA - -#include -#include -#include -#include - -#define HANDLE_ERROR(x) \ - { \ - const auto err = x; \ - if (err != 
CUTENSORNET_STATUS_SUCCESS) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " \ - << cutensornetGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - -#define HANDLE_CUDA_ERROR(x) \ - { \ - const auto err = x; \ - if (err != cudaSuccess) { \ - std::stringstream str; \ - str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ - throw std::runtime_error(str.str()); \ - } \ - }; - void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, - cmatrix_t &V) { + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle) { bool transposed = false; @@ -731,13 +698,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(), *cutensor_V = V.move_to_buffer(); - const size_t cuTensornetVersion = cutensornetGetVersion(); - - cudaDeviceProp prop; - int deviceId{-1}; - HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); - HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - cudaDataType_t typeData = CUDA_C_64F; std::vector modesA{'m', 'n'}; @@ -758,12 +718,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice)); - cudaStream_t stream; - HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); - - cutensornetHandle_t handle; - HANDLE_ERROR(cutensornetCreate(&handle)); - cutensornetTensorDescriptor_t descTensorA; cutensornetTensorDescriptor_t descTensorU; cutensornetTensorDescriptor_t descTensorV; @@ -851,8 +805,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); - HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); - HANDLE_ERROR(cutensornetDestroy(handle)); if (cutensor_S) free(cutensor_S); diff --git 
a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 9c44cfb83b..83cf769148 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -42,7 +42,7 @@ double reduce_zeros(cmatrix_t &U, rvector_t &S, cmatrix_t &V, status csvd(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); // Entry point for the SVD calculation void csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V, - bool lapack, std::string mps_svd_device); + bool lapack); // Original qiskit call void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); @@ -50,17 +50,46 @@ void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, void lapack_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, cmatrix_t &V); -#ifdef AER_THRUST_CUDA -// cutensor call -void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V); -#endif // AER_THRUST_CUDA - void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, const rvector_t &S, const cmatrix_t &V); +#ifdef AER_THRUST_CUDA + +#include +#include +#include +#include + +#define HANDLE_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUTENSORNET_STATUS_SUCCESS) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " \ + << cutensornetGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream str; \ + str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \ + throw std::runtime_error(str.str()); \ + } \ + }; + +// cutensor call +void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle); + +#endif // AER_THRUST_CUDA + 
//------------------------------------------------------------------------- } // end namespace AER //------------------------------------------------------------------------- From c24b9e2365ba281cd09c044862f3526b3817bac6 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 01:18:28 +0000 Subject: [PATCH 18/27] refactor code --- .../matrix_product_state_internal.cpp | 13 ++++--------- .../matrix_product_state_internal.hpp | 15 +-------------- .../matrix_product_state_tensor.hpp | 10 +++------- src/simulators/matrix_product_state/svd.cpp | 8 ++++++++ src/simulators/matrix_product_state/svd.hpp | 3 +-- 5 files changed, 17 insertions(+), 32 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index c301121960..9df0e3220c 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -48,10 +48,6 @@ bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; std::string MPS::mps_svd_device_; -// if cuda is present. 
-cutensornetHandle_t MPS::cuda_handle_ = NULL; -cudaStream_t MPS::cuda_stream_ = NULL; - //------------------------------------------------------------------------ // local function declarations //------------------------------------------------------------------------ @@ -669,9 +665,9 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = MPS_Tensor::Decompose( - temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, - MPS::mps_svd_device_, MPS::cuda_stream_, MPS::cuda_handle_); + double discarded_value = + MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, + MPS::mps_lapack_, MPS::mps_svd_device_); if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1812,8 +1808,7 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); if (MPS::mps_svd_device_.compare("GPU") == 0) { - cutensor_csvd_wrapper(reshaped_matrix, U, S, V, MPS::cuda_stream_, - MPS::cuda_handle_); + cutensor_csvd_wrapper(reshaped_matrix, U, S, V); } else { csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 71e067dcc5..b3d1433b19 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -82,15 +82,7 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} - ~MPS() { - if (cuda_stream_) { - HANDLE_CUDA_ERROR(cudaStreamDestroy(cuda_stream_)); - } - - if (cuda_handle_) { - HANDLE_ERROR(cutensornetDestroy(cuda_handle_)); - } - } + ~MPS() {} //-------------------------------------------------------------------------- // Function 
name: initialize @@ -340,9 +332,6 @@ class MPS { int deviceId{-1}; HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); - - HANDLE_CUDA_ERROR(cudaStreamCreate(&MPS::cuda_stream_)); - HANDLE_ERROR(cutensornetCreate(&MPS::cuda_handle_)); } static uint_t get_omp_threads() { return omp_threads_; } @@ -594,8 +583,6 @@ class MPS { static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; static std::string mps_svd_device_; - static cutensornetHandle_t cuda_handle_; - static cudaStream_t cuda_stream_; }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index b57e7e0198..d8337f9aa6 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -159,9 +159,7 @@ class MPS_Tensor { const MPS_Tensor &right_gamma, bool mul_by_lambda); static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t cuda_stream, - cutensornetHandle_t cuda_handle); + bool mps_lapack, std::string mps_svd_device); static void reshape_for_3_qubits_before_SVD(const std::vector data, MPS_Tensor &reshaped_tensor); static void contract_2_dimensions(const MPS_Tensor &left_gamma, @@ -594,16 +592,14 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, //--------------------------------------------------------------- double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t cuda_stream, - cutensornetHandle_t cuda_handle) { + bool mps_lapack, std::string mps_svd_device) { cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; 
rvector_t S(std::min(C.GetRows(), C.GetColumns())); if (mps_svd_device.compare("GPU") == 0) { - cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cuda_handle); + cutensor_csvd_wrapper(C, U, S, V); } else { csvd_wrapper(C, U, S, V, mps_lapack); } diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 820d8b29a2..36bad34d54 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -704,6 +704,12 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector modesU{'m', 'x'}; std::vector modesV{'x', 'n'}; + cudaStream_t stream; + HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); + + cutensornetHandle_t handle; + HANDLE_ERROR(cutensornetCreate(&handle)); + double *cutensor_S = (double *)malloc(sizeS); void *D_A; @@ -805,6 +811,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); + HANDLE_ERROR(cutensornetDestroy(handle)); if (cutensor_S) free(cutensor_S); diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 83cf769148..d116c89348 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -85,8 +85,7 @@ void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, // cutensor call void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V, cudaStream_t &stream, - cutensornetHandle_t &handle); + cmatrix_t &V); #endif // AER_THRUST_CUDA From 34e9502f8275302a8e2eabb663b6c098c6a17e0b Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 02:21:50 +0000 Subject: [PATCH 19/27] refactor code --- src/simulators/matrix_product_state/svd.cpp | 3 +-- 1 file changed, 1 
insertion(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 36bad34d54..33ad8a2c57 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -670,8 +670,7 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, #ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, - cmatrix_t &V, cudaStream_t &stream, - cutensornetHandle_t &handle) { + cmatrix_t &V) { bool transposed = false; From 00f88e980fa5dabb8b0df88d944c1a549e6254cb Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 04:32:03 +0000 Subject: [PATCH 20/27] refactor code; included test --- .../matrix_product_state/matrix_product_state_internal.cpp | 2 ++ .../matrix_product_state/matrix_product_state_internal.hpp | 2 ++ .../matrix_product_state/matrix_product_state_tensor.hpp | 2 ++ src/simulators/matrix_product_state/svd.cpp | 3 ++- test/terra/backends/simulator_test_case.py | 2 +- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index 9df0e3220c..813f0236e9 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -1808,7 +1808,9 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); if (MPS::mps_svd_device_.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA cutensor_csvd_wrapper(reshaped_matrix, U, S, V); +#endif // AER_THRUST_CUDA } else { csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 
b3d1433b19..7050eb44f8 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -328,10 +328,12 @@ class MPS { static void set_cuda_device() { // the prop could be used to log the properties of the device. +#ifdef AER_THRUST_CUDA cudaDeviceProp prop; int deviceId{-1}; HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); +#endif // AER_THRUST_CUDA } static uint_t get_omp_threads() { return omp_threads_; } diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index d8337f9aa6..41b78cafee 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -599,7 +599,9 @@ double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t S(std::min(C.GetRows(), C.GetColumns())); if (mps_svd_device.compare("GPU") == 0) { +#ifdef AER_THRUST_CUDA cutensor_csvd_wrapper(C, U, S, V); +#endif // AER_THRUST_CUDA } else { csvd_wrapper(C, U, S, V, mps_lapack); } diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 33ad8a2c57..981506d351 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -797,7 +797,8 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U); V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V); - validate_SVdD_result(A_cpy, U, S, V); + V = AER::Utils::dagger(V); + validate_SVD_result(A_cpy, U, S, V); if (transposed) { std::swap(U, V); } diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 2173c2c413..1362b8a861 100644 --- 
a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -85,7 +85,7 @@ def _method_device(methods): # add special test device for cuStateVec if available cuStateVec = check_cuStateVec(available_devices) - gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network"] + gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network", "matrix_product_state"] batchable_methods = ["statevector", "density_matrix"] data_args = [] for method in methods: From 454f8c0075f329c46f2e3cf6542b64410d014c6b Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 04:35:32 +0000 Subject: [PATCH 21/27] lint --- test/terra/backends/simulator_test_case.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 1362b8a861..6c38c623c6 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -85,7 +85,13 @@ def _method_device(methods): # add special test device for cuStateVec if available cuStateVec = check_cuStateVec(available_devices) - gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network", "matrix_product_state"] + gpu_methods = [ + "statevector", + "density_matrix", + "unitary", + "tensor_network", + "matrix_product_state", + ] batchable_methods = ["statevector", "density_matrix"] data_args = [] for method in methods: From 985c7f20935399ffa8a643731a36e1654f9b8343 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 18 Jun 2024 09:08:30 +0000 Subject: [PATCH 22/27] added suggestion --- test/terra/backends/simulator_test_case.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 6c38c623c6..358fe942f7 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -111,7 +111,11 
@@ def _method_device(methods): # add test cases for cuStateVec if available using special device = 'GPU_cuStateVec' #'GPU_cuStateVec' is used only inside tests not available in Aer # and this is converted to "device='GPU'" and option "cuStateVec_enalbe = True" is added - if cuStateVec and "tensor_network" != method: + if ( + cuStateVec + and "tensor_network" != method + and "matrix_product_state" != method + ): data_args.append((method, "GPU_cuStateVec")) else: data_args.append((method, "CPU")) From 34a5e75f105fb7ece277cec982ff95bbba560be0 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Sat, 31 Aug 2024 13:48:32 +0000 Subject: [PATCH 23/27] fixed a typo --- .../matrix_product_state/matrix_product_state_tensor.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 5b4a77e78c..256392e7df 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -160,9 +160,10 @@ class MPS_Tensor { static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, bool mps_lapack, std::string mps_svd_device); - - static void reshape_for_3_qubits_before_SVD(const std::vector data, - MPS_Tensor &reshaped_tensor); + + static void + reshape_for_3_qubits_before_SVD(const std::vector &data, + MPS_Tensor &reshaped_tensor); static void contract_2_dimensions(const MPS_Tensor &left_gamma, const MPS_Tensor &right_gamma, From a1ae3080fafaa1655a35aa1bf44303eaacd29306 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 10 Sep 2024 23:06:26 +0000 Subject: [PATCH 24/27] refactor code --- .../matrix_product_state.hpp | 7 ++++- .../matrix_product_state_internal.cpp | 22 ++++++++++---- .../matrix_product_state_internal.hpp | 25 ++++++++-------- .../matrix_product_state_tensor.hpp | 30 
+++++++++++++++---- src/simulators/matrix_product_state/svd.cpp | 11 ++----- src/simulators/matrix_product_state/svd.hpp | 3 +- 6 files changed, 64 insertions(+), 34 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 0346b01f84..ae4f3bddd5 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -366,7 +366,12 @@ void State::set_config(const Config &config) { // Get CUDA device, if GPU offloading enabled if (config.device.compare("GPU") == 0) { - MPS::set_cuda_device(); +#ifdef AER_THRUST_CUDA + cudaDeviceProp prop; + int deviceId{-1}; + HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); +#endif // AER_THRUST_CUDA } } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index a7d5a121ec..c48a4d4c24 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -46,7 +46,9 @@ double MPS::json_chop_threshold_ = 1E-8; std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; +#ifdef AER_THRUST_CUDA std::string MPS::mps_svd_device_; +#endif // AER_THRUST_CUDA //------------------------------------------------------------------------ // local function declarations @@ -665,9 +667,14 @@ void MPS::common_apply_2_qubit_gate( MPS_Tensor left_gamma, right_gamma; rvector_t lambda; - double discarded_value = - MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, - MPS::mps_lapack_, MPS::mps_svd_device_); +#ifdef AER_THRUST_CUDA + double discarded_value = MPS_Tensor::Decompose( + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, + MPS::mps_svd_device_, cuda_stream, cutensor_handle); 
+#else + double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, + right_gamma, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA if (discarded_value > json_chop_threshold_) MPS::print_to_log("discarded_value=", discarded_value, ", "); @@ -1807,13 +1814,16 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.clear(); S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); - if (MPS::mps_svd_device_.compare("GPU") == 0) { #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(reshaped_matrix, U, S, V); -#endif // AER_THRUST_CUDA + if (MPS::mps_svd_device_.compare("GPU") == 0) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream, + cutensor_handle); } else { csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); } +#else + csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_); +#endif // AER_THRUST_CUDA reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(), MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_); diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 7050eb44f8..599604fa8b 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -81,7 +81,14 @@ enum class MPS_swap_direction { SWAP_LEFT, SWAP_RIGHT }; class MPS { public: - MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) {} + MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) { +#ifdef AER_THRUST_CUDA + if (mps_svd_device_.compare("GPU") == 0) { + cudaStreamCreate(&cuda_stream); + cutensornetCreate(&cutensor_handle); + } +#endif // AER_THRUST_CUDA + } ~MPS() {} //-------------------------------------------------------------------------- @@ -325,17 +332,6 @@ class MPS { mps_svd_device_ = mps_svd_device; } - static void set_cuda_device() { - // the prop could be used to log the properties of the device. 
- -#ifdef AER_THRUST_CUDA - cudaDeviceProp prop; - int deviceId{-1}; - HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); - HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); -#endif // AER_THRUST_CUDA - } - static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } static double get_json_chop_threshold() { return json_chop_threshold_; } @@ -558,6 +554,11 @@ class MPS { std::vector q_reg_; std::vector lambda_reg_; +#ifdef AER_THRUST_CUDA + cudaStream_t cuda_stream; + cutensornetHandle_t cutensor_handle; +#endif // AER_THRUST_CUDA + struct ordering { // order_ stores the current ordering of the qubits, // location_ stores the location of each qubit in the vector. It is derived diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 256392e7df..2886333b34 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -157,9 +157,17 @@ class MPS_Tensor { static MPS_Tensor contract(const MPS_Tensor &left_gamma, const rvector_t &lambda, const MPS_Tensor &right_gamma, bool mul_by_lambda); +#ifdef AER_THRUST_CUDA + static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle); +#else static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device); + bool mps_lapack); +#endif // AER_THRUST_CUDA static void reshape_for_3_qubits_before_SVD(const std::vector &data, @@ -593,21 +601,33 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, // rvector_t &lambda - tensors for the result. // Returns: none. 
//--------------------------------------------------------------- +#ifdef AER_THRUST_CUDA +double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, + rvector_t &lambda, MPS_Tensor &right_gamma, + bool mps_lapack, std::string mps_svd_device, + cudaStream_t &cuda_stream, + cutensornetHandle_t &cutensor_handle) +#else double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device) { + bool mps_lapack) +#endif // AER_THRUST_CUDA +{ cmatrix_t C; C = reshape_before_SVD(temp.data_); cmatrix_t U, V; rvector_t S(std::min(C.GetRows(), C.GetColumns())); - if (mps_svd_device.compare("GPU") == 0) { #ifdef AER_THRUST_CUDA - cutensor_csvd_wrapper(C, U, S, V); -#endif // AER_THRUST_CUDA + if (mps_svd_device.compare("GPU") == 0) { + cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle); } else { csvd_wrapper(C, U, S, V, mps_lapack); } +#else + csvd_wrapper(C, U, S, V, mps_lapack); +#endif // AER_THRUST_CUDA + double discarded_value = 0.0; discarded_value = reduce_zeros(U, S, V, max_bond_dimension_, truncation_threshold_, mps_lapack); diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 8ba5f45af6..38c74b16a0 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -670,7 +670,8 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, #ifdef AER_THRUST_CUDA void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, - cmatrix_t &V) { + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle) { bool transposed = false; @@ -703,12 +704,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, std::vector modesU{'m', 'x'}; std::vector modesV{'x', 'n'}; - cudaStream_t stream; - HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); - - cutensornetHandle_t handle; - HANDLE_ERROR(cutensornetCreate(&handle)); - double 
*cutensor_S = (double *)malloc(sizeS); void *D_A; @@ -811,8 +806,6 @@ void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU)); HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV)); HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); - HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); - HANDLE_ERROR(cutensornetDestroy(handle)); if (cutensor_S) free(cutensor_S); diff --git a/src/simulators/matrix_product_state/svd.hpp b/src/simulators/matrix_product_state/svd.hpp index 9594d597f1..69e2518462 100644 --- a/src/simulators/matrix_product_state/svd.hpp +++ b/src/simulators/matrix_product_state/svd.hpp @@ -85,7 +85,8 @@ void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U, // cutensor call void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S, - cmatrix_t &V); + cmatrix_t &V, cudaStream_t &stream, + cutensornetHandle_t &handle); #endif // AER_THRUST_CUDA From ed9a907c6c34688564cbaec1f0a96c3c0f99dfe8 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Mon, 11 Nov 2024 11:30:20 +0000 Subject: [PATCH 25/27] refactor code --- .../matrix_product_state/matrix_product_state.hpp | 10 +--------- .../matrix_product_state_internal.hpp | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index ae4f3bddd5..9de6b405ae 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -362,17 +362,9 @@ void State::set_config(const Config &config) { MPS::set_mps_lapack_svd(config.mps_lapack); // Set device for SVD - MPS::set_mps_svd_device(config.device); - - // Get CUDA device, if GPU offloading enabled - if (config.device.compare("GPU") == 0) { #ifdef AER_THRUST_CUDA - cudaDeviceProp prop; - int deviceId{-1}; - 
HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId)); - HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId)); + MPS::set_mps_svd_device(config.device); #endif // AER_THRUST_CUDA - } } void State::add_metadata(ExperimentResult &result) const { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 599604fa8b..548fe31611 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -83,13 +83,22 @@ class MPS { public: MPS(uint_t num_qubits = 0) : num_qubits_(num_qubits) { #ifdef AER_THRUST_CUDA + cuda_stream = NULL; + cutensor_handle = NULL; if (mps_svd_device_.compare("GPU") == 0) { cudaStreamCreate(&cuda_stream); cutensornetCreate(&cutensor_handle); } #endif // AER_THRUST_CUDA } - ~MPS() {} + ~MPS() { +#ifdef AER_THRUST_CUDA + if (cutensor_handle) + cutensornetDestroy(cutensor_handle); + if (cuda_stream) + cudaStreamDestroy(cuda_stream); +#endif // AER_THRUST_CUDA + } //-------------------------------------------------------------------------- // Function name: initialize @@ -328,9 +337,11 @@ class MPS { } static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } +#ifdef AER_THRUST_CUDA static void set_mps_svd_device(std::string mps_svd_device) { mps_svd_device_ = mps_svd_device; } +#endif // AER_THRUST_CUDA static uint_t get_omp_threads() { return omp_threads_; } static uint_t get_omp_threshold() { return omp_threshold_; } @@ -585,7 +596,9 @@ class MPS { static bool mps_log_data_; static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; +#ifdef AER_THRUST_CUDA static std::string mps_svd_device_; +#endif // AER_THRUST_CUDA }; inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) { From e1e80d1effad2d86eea052c5b9324e9eeae8a0e9 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Thu, 28 Nov 2024 06:43:03 +0000 Subject: 
[PATCH 26/27] added cublas for contract --- .../matrix_product_state.hpp | 2 +- .../matrix_product_state_internal.cpp | 37 +++- .../matrix_product_state_internal.hpp | 14 +- .../matrix_product_state_tensor.hpp | 167 +++++++++++++++++- src/simulators/matrix_product_state/svd.cpp | 1 - 5 files changed, 202 insertions(+), 19 deletions(-) diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp index 9de6b405ae..1e4aafc1aa 100644 --- a/src/simulators/matrix_product_state/matrix_product_state.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state.hpp @@ -363,7 +363,7 @@ void State::set_config(const Config &config) { // Set device for SVD #ifdef AER_THRUST_CUDA - MPS::set_mps_svd_device(config.device); + MPS::set_mps_device(config.device); #endif // AER_THRUST_CUDA } diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index c48a4d4c24..8df59d349c 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -47,7 +47,7 @@ std::stringstream MPS::logging_str_; bool MPS::mps_log_data_ = 0; bool MPS::mps_lapack_ = false; #ifdef AER_THRUST_CUDA -std::string MPS::mps_svd_device_; +std::string MPS::mps_device_; #endif // AER_THRUST_CUDA //------------------------------------------------------------------------ @@ -631,8 +631,14 @@ void MPS::common_apply_2_qubit_gate( if (A + 1 != num_qubits_ - 1) q_reg_[A + 1].mul_Gamma_by_right_Lambda(lambda_reg_[A + 1]); +#ifdef AER_THRUST_CUDA + MPS_Tensor temp = + MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1], + MPS::mps_device_, num_qubits_, cublas_handle); +#else MPS_Tensor temp = MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1]); +#endif // AER_THRUST_CUDA switch (gate_type) { case cx: @@ -669,8 +675,8 @@ void 
MPS::common_apply_2_qubit_gate( rvector_t lambda; #ifdef AER_THRUST_CUDA double discarded_value = MPS_Tensor::Decompose( - temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, - MPS::mps_svd_device_, cuda_stream, cutensor_handle); + temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, MPS::mps_device_, + num_qubits_, cuda_stream, cutensor_handle); #else double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_); @@ -1301,6 +1307,7 @@ MPS_Tensor MPS::state_vec_as_MPS(const reg_t &qubits) { } MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const { + MPS_Tensor temp = q_reg_[first_index]; if (first_index != 0) @@ -1313,8 +1320,14 @@ MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const { } for (uint_t i = first_index + 1; i < last_index + 1; i++) { +#ifdef AER_THRUST_CUDA + temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i], + MPS::mps_device_, num_qubits_, cublas_handle); +#else temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i]); +#endif // AER_THRUST_CUDA } + // now temp is a tensor of 2^n matrices if (last_index != num_qubits_ - 1) temp.mul_Gamma_by_right_Lambda(lambda_reg_[last_index]); @@ -1815,7 +1828,23 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns())); #ifdef AER_THRUST_CUDA - if (MPS::mps_svd_device_.compare("GPU") == 0) { + // Before offloading to the GPU, we need to make sure the size of + // matrices to be operated on should be big enough to make + // the time taken by the initialization of `cutensornet` worth it. + // Size of matrices involved for qubits < 13 is too small to be + // considered to offload to the GPU. 
+ // Even if num_qubits involved in the operation is > 13, still + // a lot of the matrices are too small to be offloaded, so to determine + // the size of matrices to offload, we can look at the number of elements + // it has. The number of elements `8401` and num_qubits > 13 has been + // obtained not on any theoritical basis, but on trial and error. + // The number of elements and number of qubits depends solely on + // difference between the speed of CPU and GPU involved. Even if matrices + // are big, they are not big enough to make speed of PICe a bottleneck. + // In this particular case CPU was `Zen+` and GPU was `NVIDIA Ampere`. + if ((num_qubits_ > 13) && (MPS::mps_device_.compare("GPU") == 0) && + ((reshaped_matrix.GetRows() * reshaped_matrix.GetColumns()) > 8401)) { + cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream, cutensor_handle); } else { diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp index 548fe31611..5275fc2df7 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.hpp @@ -85,9 +85,12 @@ class MPS { #ifdef AER_THRUST_CUDA cuda_stream = NULL; cutensor_handle = NULL; - if (mps_svd_device_.compare("GPU") == 0) { + cublas_handle = NULL; + if (mps_device_.compare("GPU") == 0) { cudaStreamCreate(&cuda_stream); cutensornetCreate(&cutensor_handle); + cublasCreate(&cublas_handle); + cublasSetStream(cublas_handle, cuda_stream); } #endif // AER_THRUST_CUDA } @@ -95,6 +98,8 @@ class MPS { #ifdef AER_THRUST_CUDA if (cutensor_handle) cutensornetDestroy(cutensor_handle); + if (cublas_handle) + cublasDestroy(cublas_handle); if (cuda_stream) cudaStreamDestroy(cuda_stream); #endif // AER_THRUST_CUDA @@ -338,8 +343,8 @@ class MPS { static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; } #ifdef AER_THRUST_CUDA - static void 
set_mps_svd_device(std::string mps_svd_device) { - mps_svd_device_ = mps_svd_device; + static void set_mps_device(std::string mps_device) { + mps_device_ = mps_device; } #endif // AER_THRUST_CUDA @@ -568,6 +573,7 @@ class MPS { #ifdef AER_THRUST_CUDA cudaStream_t cuda_stream; cutensornetHandle_t cutensor_handle; + cublasHandle_t cublas_handle; #endif // AER_THRUST_CUDA struct ordering { @@ -597,7 +603,7 @@ class MPS { static MPS_swap_direction mps_swap_direction_; static bool mps_lapack_; #ifdef AER_THRUST_CUDA - static std::string mps_svd_device_; + static std::string mps_device_; #endif // AER_THRUST_CUDA }; diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 2886333b34..31aae8e347 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -15,6 +15,19 @@ #ifndef _tensor_tensor_hpp_ #define _tensor_tensor_hpp_ +#ifdef AER_THRUST_CUDA +#include +#define HANDLE_CUBLAS_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + printf("Error: %s in line %d\n", cublasGetStatusString(err), __LINE__); \ + fflush(stdout); \ + } \ + }; + +#endif // AER_THRUST_CUDA + #include #include #include @@ -154,19 +167,33 @@ class MPS_Tensor { void mul_Gamma_by_right_Lambda(const rvector_t &Lambda); void div_Gamma_by_left_Lambda(const rvector_t &Lambda); void div_Gamma_by_right_Lambda(const rvector_t &Lambda); - static MPS_Tensor contract(const MPS_Tensor &left_gamma, - const rvector_t &lambda, - const MPS_Tensor &right_gamma, bool mul_by_lambda); + #ifdef AER_THRUST_CUDA static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t &cuda_stream, + bool mps_lapack, std::string mps_device, + uint_t num_qubits, cudaStream_t &cuda_stream, 
cutensornetHandle_t &cutensor_handle); + + static MPS_Tensor contract(const MPS_Tensor &left_gamma, + const rvector_t &lambda, + const MPS_Tensor &right_gamma, + std::string mps_device, uint_t num_qubits, + const cublasHandle_t &cublas_handle, + bool mul_by_lambda); + + static void cublas_Zgemm_(cmatrix_t &A, cmatrix_t &B, cmatrix_t &C, + const cublasHandle_t &cublas_handle); + #else static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, bool mps_lapack); + + static MPS_Tensor contract(const MPS_Tensor &left_gamma, + const rvector_t &lambda, + const MPS_Tensor &right_gamma, bool mul_by_lambda); + #endif // AER_THRUST_CUDA static void @@ -503,6 +530,65 @@ void MPS_Tensor::mul_Gamma_by_Lambda(const rvector_t &Lambda, // tensors to contract. // Returns: The result tensor of the contract //--------------------------------------------------------------- +#ifdef AER_THRUST_CUDA +MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, + const rvector_t &lambda, + const MPS_Tensor &right_gamma, + std::string mps_device, uint_t num_qubits, + const cublasHandle_t &cublas_handle, + bool mul_by_lambda = true) { + MPS_Tensor Res; + MPS_Tensor new_left = left_gamma; + if (mul_by_lambda) { + new_left.mul_Gamma_by_right_Lambda(lambda); + } + + for (uint_t idx1 = 0; idx1 < new_left.data_.size(); idx1++) { + for (uint_t idx2 = 0; idx2 < right_gamma.data_.size(); idx2++) { + + // Before offloading to the GPU, we need to make sure the size of + // matrices to be operated on should be big enough to make the time + // taken by the initialization of involved GPU routines worth it. + // Size of matrices involved for qubits < 16 is too small to be + // considered to offload to the GPU. + // Even if num_qubits involved in the operation is > 16, still + // a lot of the matrices are too small to be offloaded. 
+ // Matrices with rows/cols > 128 when offloaded to GPU for multiplication + // show speed up when compared to the operation done on the CPU. + // This observation was obtained on trial and error, and, does not have + // any theoritical basis. + // The number of rows/cols and number of qubits depends solely on + // difference between the speed of CPU and GPU involved. Even if matrices + // are big, they are not big enough to make speed of PICe a bottleneck. + // In this particular case CPU was `Zen+` and GPU was `NVIDIA Ampere`. + + if ((num_qubits > 16) && (mps_device.compare("GPU") == 0)) { + cmatrix_t mat1 = new_left.data_[idx1], mat2 = right_gamma.data_[idx2]; + + uint_t mat1_rows = mat1.GetRows(), mat1_cols = mat1.GetColumns(), + mat2_rows = mat2.GetRows(), mat2_cols = mat2.GetColumns(); + + // If the mps_device is set to `GPU`, also need to make + // sure if the matrix is large enough to make it worth + // invoking needed GPU routines. + // If the matrix is not big enough, the multiplication + // will be done on CPU using openblas zgemm_ routine. 
+ if ((mat1_rows > 128) && (mat1_cols > 128) && (mat2_cols > 128)) { + cmatrix_t mat1_mat2(mat1_rows, mat2_cols); + cublas_Zgemm_(mat1, mat2, mat1_mat2, cublas_handle); + Res.data_.push_back(mat1_mat2); + } else { + Res.data_.push_back(mat1 * mat2); + } + } else { + Res.data_.push_back(new_left.data_[idx1] * right_gamma.data_[idx2]); + } + } + } + + return Res; +} +#else MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, const rvector_t &lambda, const MPS_Tensor &right_gamma, @@ -514,11 +600,11 @@ MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, } for (uint_t i = 0; i < new_left.data_.size(); i++) for (uint_t j = 0; j < right_gamma.data_.size(); j++) { - Res.data_.push_back(new_left.data_[i] * right_gamma.data_[j]); } return Res; } +#endif //--------------------------------------------------------------- // Function name: contract_2_dimensions @@ -604,8 +690,8 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma, #ifdef AER_THRUST_CUDA double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t &lambda, MPS_Tensor &right_gamma, - bool mps_lapack, std::string mps_svd_device, - cudaStream_t &cuda_stream, + bool mps_lapack, std::string mps_device, + uint_t num_qubits, cudaStream_t &cuda_stream, cutensornetHandle_t &cutensor_handle) #else double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, @@ -619,7 +705,22 @@ double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, rvector_t S(std::min(C.GetRows(), C.GetColumns())); #ifdef AER_THRUST_CUDA - if (mps_svd_device.compare("GPU") == 0) { + // Before offloading to the GPU, we need to make sure the size of + // matrices to be operated on should be big enough to make + // the time taken by the initialization of `cutensornet` worth it. + // Size of matrices involved for qubits < 13 is too small to be + // considered to offload to the GPU. 
+ // Even if num_qubits involved in the operation is > 13, still + // a lot of the matrices are too small to be offloaded, so to determine + // the size of matrices to offload, we can look at the number of elements + // it has. The number of elements `8401` and num_qubits > 13 has been + // obtained not on any theoritical basis, but on trial and error. + // The number of elements and number of qubits depends solely on + // difference between the speed of CPU and GPU involved. Even if matrices + // are big, they are not big enough to make speed of PICe a bottleneck. + // In this particular case CPU was `Zen+` and GPU was `NVIDIA Ampere`. + if ((num_qubits > 13) && (mps_device.compare("GPU") == 0) && + ((C.GetRows() * C.GetColumns()) > 8401)) { cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle); } else { csvd_wrapper(C, U, S, V, mps_lapack); @@ -660,6 +761,54 @@ void MPS_Tensor::reshape_for_3_qubits_before_SVD( reshaped_tensor = MPS_Tensor(new_data_vector); } +#ifdef AER_THRUST_CUDA +void MPS_Tensor::cublas_Zgemm_(cmatrix_t &A, cmatrix_t &B, cmatrix_t &C, + const cublasHandle_t &cublas_handle) { + size_t A_rows = A.GetRows(), A_cols = A.GetColumns(), B_rows = B.GetRows(), + B_cols = B.GetColumns(), + size_cuDoubleComplex = sizeof(cuDoubleComplex); + + size_t size_A = A_rows * A_cols * size_cuDoubleComplex, + size_B = B_rows * B_cols * size_cuDoubleComplex, + size_C = A_rows * B_cols * size_cuDoubleComplex; + + cuDoubleComplex *A_data_dev, *B_data_dev, *C_data_dev; + + HANDLE_CUDA_ERROR(cudaMalloc((void **)&A_data_dev, size_A)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&B_data_dev, size_B)); + HANDLE_CUDA_ERROR(cudaMalloc((void **)&C_data_dev, size_C)); + + complex_t *A_data = A.move_to_buffer(), *B_data = B.move_to_buffer(); + + HANDLE_CUDA_ERROR( + cudaMemcpy(A_data_dev, A_data, size_A, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR( + cudaMemcpy(B_data_dev, B_data, size_B, cudaMemcpyHostToDevice)); + + A = std::move(cmatrix_t::move_from_buffer(A_rows, 
A_cols, A_data)); + B = std::move(cmatrix_t::move_from_buffer(B_rows, B_cols, B_data)); + + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0), + beta = make_cuDoubleComplex(0.0, 0.0); + + cublasZgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, A_rows, B_cols, A_cols, + &alpha, reinterpret_cast(A_data_dev), + A_rows, reinterpret_cast(B_data_dev), + B_rows, &beta, C_data_dev, A_rows); + + complex_t *C_res = reinterpret_cast(malloc(size_C)); + HANDLE_CUDA_ERROR( + cudaMemcpy(C_res, C_data_dev, size_C, cudaMemcpyDeviceToHost)); + C = std::move(cmatrix_t::copy_from_buffer(A_rows, B_cols, C_res)); + + HANDLE_CUDA_ERROR(cudaFree(A_data_dev)); + HANDLE_CUDA_ERROR(cudaFree(B_data_dev)); + HANDLE_CUDA_ERROR(cudaFree(C_data_dev)); + if (C_res) + free(C_res); +} +#endif // AER_THRUST_CUDA + //------------------------------------------------------------------------- } // end namespace MatrixProductState //------------------------------------------------------------------------- diff --git a/src/simulators/matrix_product_state/svd.cpp b/src/simulators/matrix_product_state/svd.cpp index 38c74b16a0..313fe417bf 100644 --- a/src/simulators/matrix_product_state/svd.cpp +++ b/src/simulators/matrix_product_state/svd.cpp @@ -672,7 +672,6 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V, cudaStream_t &stream, cutensornetHandle_t &handle) { - bool transposed = false; const int64_t rows = A.GetRows(), cols = A.GetColumns(); From 321230c77932e336c72fa35f8a9c72168c08c3c4 Mon Sep 17 00:00:00 2001 From: MozammilQ Date: Tue, 3 Dec 2024 08:01:12 +0000 Subject: [PATCH 27/27] fixed typo --- .../mps-svd-with-cuquantum-c0392854d1f373e0.yaml | 12 +++++++----- .../matrix_product_state_internal.cpp | 2 +- .../matrix_product_state_tensor.hpp | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml 
b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml index 8e065ad484..17a7125fc9 100644 --- a/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml +++ b/releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml @@ -2,11 +2,13 @@ features: - | This PR adds the ability to run Matrix Product State Simulation on Nvidia GPUs. - To be precise, this PR offloads the Singular Value Decomposition required for - Matrix Product State Simulation to Nvidia GPUs with the help of cuQuantum. - - While choosing for the backend for Matrix Product State simulation users can - choose all as usual, but this time they can choose the device as GPU. + + While choosing for the backend for Matrix Product State simulation users can + choose all as usual, but this time when they choose the device as GPU, the + simulation is accelerated by NVIDIA GPU. To be precise, this PR offloads the + Singular Value Decomposition and matrix multiplication of bigger matrices involved + in Matrix Product State Simulation to GPU. + Example diff --git a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp index 8df59d349c..abb98f551a 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_internal.cpp +++ b/src/simulators/matrix_product_state/matrix_product_state_internal.cpp @@ -1837,7 +1837,7 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) { // a lot of the matrices are too small to be offloaded, so to determine // the size of matrices to offload, we can look at the number of elements // it has. The number of elements `8401` and num_qubits > 13 has been - // obtained not on any theoritical basis, but on trial and error. + // obtained not on any theoretical basis, but on trial and error. // The number of elements and number of qubits depends solely on // difference between the speed of CPU and GPU involved. 
Even if matrices // are big, they are not big enough to make speed of PICe a bottleneck. diff --git a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp index 31aae8e347..d81026c0e2 100644 --- a/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp +++ b/src/simulators/matrix_product_state/matrix_product_state_tensor.hpp @@ -556,7 +556,7 @@ MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma, // Matrices with rows/cols > 128 when offloaded to GPU for multiplication // show speed up when compared to the operation done on the CPU. // This observation was obtained on trial and error, and, does not have - // any theoritical basis. + // any theoretical basis. // The number of rows/cols and number of qubits depends solely on // difference between the speed of CPU and GPU involved. Even if matrices // are big, they are not big enough to make speed of PICe a bottleneck. @@ -714,7 +714,7 @@ double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma, // a lot of the matrices are too small to be offloaded, so to determine // the size of matrices to offload, we can look at the number of elements // it has. The number of elements `8401` and num_qubits > 13 has been - // obtained not on any theoritical basis, but on trial and error. + // obtained not on any theoretical basis, but on trial and error. // The number of elements and number of qubits depends solely on // difference between the speed of CPU and GPU involved. Even if matrices // are big, they are not big enough to make speed of PICe a bottleneck.