Unify gate memory layout for lgpu and ltensor (#959)
### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
      If you've fixed a bug or added code that should be tested, add a test to
      the [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and documented.
      If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`. 

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[sc-77874]

The excitation gates' memory layout was column-major for LGPU, while it is
row-major for LTensor. This PR unifies the memory layout to row-major for both
backends, which improves the maintainability of Lightning.
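
For illustration only (this snippet is not part of the PR), a minimal sketch of
what the two layouts mean for a two-qubit excitation generator: the same matrix
elements land at different flat indices, and the two buffers are transposes of
each other, so a backend that expects one layout but receives the other
effectively applies the transposed matrix.

```cpp
// Minimal illustration only (not code from this PR): the same 4x4 matrix,
// element (row, col), stored under the two flat layouts.
#include <cassert>
#include <complex>
#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t dim = 4; // single-excitation gates act on 2 qubits
    std::vector<std::complex<double>> row_major(dim * dim, {0.0, 0.0});
    std::vector<std::complex<double>> col_major(dim * dim, {0.0, 0.0});

    // Generator of SingleExcitation: -i at (1, 2), +i at (2, 1).
    auto set = [&](std::size_t r, std::size_t c, std::complex<double> v) {
        row_major[r * dim + c] = v; // rows are contiguous
        col_major[c * dim + r] = v; // columns are contiguous
    };
    set(1, 2, {0.0, -1.0});
    set(2, 1, {0.0, 1.0});

    // The two buffers are transposes of each other.
    for (std::size_t r = 0; r < dim; r++) {
        for (std::size_t c = 0; c < dim; c++) {
            assert(row_major[r * dim + c] == col_major[c * dim + r]);
        }
    }
    return 0;
}
```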

**Description of the Change:**

- Excitation-gate matrices are stored in row-major order for LGPU, matching LTensor.
- The per-call-site reversal of control/target wire indices (`ctrls_local`/`tgts_local` and the `std::reverse` calls before `applyDeviceGeneralGate_`) is removed from `StateVectorCudaMPI` and `StateVectorCudaManaged`; the reversal required by the cuStateVec wire-ordering convention now happens once inside `applyCuSVDeviceMatrixGate` / `applyDeviceGeneralGate_` (see the sketch below).
- The `"Matrix"` branch passes the host-side gate matrix directly instead of staging it in an intermediate `DataBuffer`.
- The LGPU generator tests build their mostly-zero reference matrices programmatically instead of spelling out every element.
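
A simplified, hypothetical sketch of the resulting call pattern (names and
bodies are illustrative, not the actual Lightning signatures): the helper owns
the index reversal, so call sites pass `ctrls`/`tgts` untouched.

```cpp
// Simplified, illustrative sketch only — names and bodies are hypothetical;
// the real helpers are applyCuSVDeviceMatrixGate / applyDeviceGeneralGate_,
// which now take the index vectors by non-const reference and reverse them
// in place before calling into cuStateVec.
#include <algorithm>
#include <vector>

void applyDeviceGateSketch(std::vector<int> &ctrls, std::vector<int> &tgts) {
    // Reverse once here, instead of building reversed copies at every caller.
    std::reverse(ctrls.begin(), ctrls.end());
    std::reverse(tgts.begin(), tgts.end());
    // ... hand ctrls/tgts (and the gate matrix) to the device kernel ...
}

int main() {
    std::vector<int> ctrls{0};
    std::vector<int> tgts{1, 2};
    applyDeviceGateSketch(ctrls, tgts); // call sites no longer pre-reverse
    return 0;
}
```
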
**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <[email protected]>
multiphaseCFD and ringo-but-quantum authored Nov 8, 2024
1 parent f594f29 commit 0419cdb
Showing 6 changed files with 102 additions and 224 deletions.
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
@@ -9,6 +9,9 @@

### Improvements

* Unify excitation gates' memory layout to row-major for both LGPU and LT.
[(#959)](https://github.com/PennyLaneAI/pennylane-lightning/pull/959)

* Update the `lightning.kokkos` CUDA backend for compatibility with Catalyst.
[(#942)](https://github.com/PennyLaneAI/pennylane-lightning/pull/942)

2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.40.0-dev3"
__version__ = "0.40.0-dev4"
@@ -399,39 +399,18 @@ class StateVectorCudaMPI final
applyParametricPauliGate({opName}, ctrls, tgts, params.front(),
adjoint);
} else if (opName == "Rot" || opName == "CRot") {
if (adjoint) {
auto rot_matrix =
cuGates::getRot<CFP_t>(params[2], params[1], params[0]);
applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, true);
} else {
auto rot_matrix =
cuGates::getRot<CFP_t>(params[0], params[1], params[2]);
applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, false);
}
auto rot_matrix =
adjoint
? cuGates::getRot<CFP_t>(params[2], params[1], params[0])
: cuGates::getRot<CFP_t>(params[0], params[1], params[2]);
applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, adjoint);
} else if (opName == "Matrix") {
DataBuffer<CFP_t, int> d_matrix{
gate_matrix.size(), BaseType::getDataBuffer().getDevTag(),
true};
d_matrix.CopyHostDataToGpu(gate_matrix.data(), d_matrix.getLength(),
false);
// ensure wire indexing correctly preserved for tensor-observables
const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
ctrls.rend()};
const std::vector<std::size_t> tgts_local{tgts.rbegin(),
tgts.rend()};
applyDeviceMatrixGate(d_matrix.getData(), ctrls_local, tgts_local,
adjoint);
applyDeviceMatrixGate(gate_matrix.data(), ctrls, tgts, adjoint);
} else if (par_gates_.find(opName) != par_gates_.end()) {
par_gates_.at(opName)(wires, adjoint, params);
} else { // No offloadable function call; defer to matrix passing
auto &&par =
(params.empty()) ? std::vector<Precision>{0.0} : params;
// ensure wire indexing correctly preserved for tensor-observables
const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
ctrls.rend()};
const std::vector<std::size_t> tgts_local{tgts.rbegin(),
tgts.rend()};

if (!gate_cache_.gateExists(opName, par[0]) &&
gate_matrix.empty()) {
std::string message = "Currently unsupported gate: " + opName;
@@ -440,8 +419,8 @@ class StateVectorCudaMPI final
gate_cache_.add_gate(opName, par[0], gate_matrix);
}
applyDeviceMatrixGate(
gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local,
tgts_local, adjoint);
gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls, tgts,
adjoint);
}
}

@@ -1826,9 +1805,8 @@ class StateVectorCudaMPI final
* @param tgts Target qubits.
* @param use_adjoint Use adjoint of given gate.
*/
void applyCuSVDeviceMatrixGate(const CFP_t *matrix,
const std::vector<int> &ctrls,
const std::vector<int> &tgts,
void applyCuSVDeviceMatrixGate(const CFP_t *matrix, std::vector<int> &ctrls,
std::vector<int> &tgts,
bool use_adjoint = false) {
void *extraWorkspace = nullptr;
std::size_t extraWorkspaceSizeInBytes = 0;
@@ -1846,6 +1824,9 @@ class StateVectorCudaMPI final
compute_type = CUSTATEVEC_COMPUTE_32F;
}

std::reverse(tgts.begin(), tgts.end());
std::reverse(ctrls.begin(), ctrls.end());

// check the size of external workspace
PL_CUSTATEVEC_IS_SUCCESS(custatevecApplyMatrixGetWorkspaceSize(
/* custatevecHandle_t */ handle_.get(),
@@ -314,29 +314,12 @@ class StateVectorCudaManaged
applyDeviceMatrixGate_(rot_matrix.data(), ctrls, tgts, false);
}
} else if (opName == "Matrix") {
DataBuffer<CFP_t, int> d_matrix{
gate_matrix.size(), BaseType::getDataBuffer().getDevTag(),
true};
d_matrix.CopyHostDataToGpu(gate_matrix.data(), d_matrix.getLength(),
false);
// ensure wire indexing correctly preserved for tensor-observables
const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
ctrls.rend()};
const std::vector<std::size_t> tgts_local{tgts.rbegin(),
tgts.rend()};
applyDeviceMatrixGate_(d_matrix.getData(), ctrls_local, tgts_local,
adjoint);
applyDeviceMatrixGate_(gate_matrix.data(), ctrls, tgts, adjoint);
} else if (par_gates_.find(opName) != par_gates_.end()) {
par_gates_.at(opName)(wires, adjoint, params);
} else { // No offloadable function call; defer to matrix passing
auto &&par =
(params.empty()) ? std::vector<Precision>{0.0} : params;
// ensure wire indexing correctly preserved for tensor-observables
const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
ctrls.rend()};
const std::vector<std::size_t> tgts_local{tgts.rbegin(),
tgts.rend()};

if (!gate_cache_.gateExists(opName, par[0]) &&
gate_matrix.empty()) {
std::string message = "Currently unsupported gate: " + opName +
@@ -346,8 +329,8 @@ class StateVectorCudaManaged
gate_cache_.add_gate(opName, par[0], gate_matrix);
}
applyDeviceMatrixGate_(
gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local,
tgts_local, adjoint);
gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls, tgts,
adjoint);
}
}

@@ -432,9 +415,6 @@

gate_cache_.add_gate(opName, par[0], matrix_cu);
}
std::reverse(ctrlsInt.begin(), ctrlsInt.end());
std::reverse(tgtsInt.begin(), tgtsInt.end());
std::reverse(ctrls_valuesInt.begin(), ctrls_valuesInt.end());
applyDeviceGeneralGate_(
gate_cache_.get_gate_device_ptr(opName, par[0]), ctrlsInt,
tgtsInt, ctrls_valuesInt, adjoint);
@@ -474,10 +454,6 @@
auto ctrls_valuesInt =
Pennylane::Util::cast_vector<bool, int>(controlled_values);

std::reverse(ctrlsInt.begin(), ctrlsInt.end());
std::reverse(tgtsInt.begin(), tgtsInt.end());
std::reverse(ctrls_valuesInt.begin(), ctrls_valuesInt.end());

applyDeviceGeneralGate_(d_matrix.getData(), ctrlsInt, tgtsInt,
ctrls_valuesInt, inverse);
}
@@ -1620,10 +1596,9 @@ class StateVectorCudaManaged
* @param ctrls_values Control values.
* @param use_adjoint Use adjoint of given gate. Defaults to false.
*/
void applyDeviceGeneralGate_(const CFP_t *matrix,
const std::vector<int> &ctrls,
const std::vector<int> &tgts,
const std::vector<int> &ctrls_values,
void applyDeviceGeneralGate_(const CFP_t *matrix, std::vector<int> &ctrls,
std::vector<int> &tgts,
std::vector<int> &ctrls_values,
bool use_adjoint = false) {
void *extraWorkspace = nullptr;
std::size_t extraWorkspaceSizeInBytes = 0;
@@ -1641,6 +1616,10 @@ class StateVectorCudaManaged
compute_type = CUSTATEVEC_COMPUTE_32F;
}

std::reverse(tgts.begin(), tgts.end());
std::reverse(ctrls.begin(), ctrls.end());
std::reverse(ctrls_values.begin(), ctrls_values.end());

// check the size of external workspace
PL_CUSTATEVEC_IS_SUCCESS(custatevecApplyMatrixGetWorkspaceSize(
/* custatevecHandle_t */ handle_.get(),
@@ -793,14 +793,10 @@ TEST_CASE("Generators::applyGeneratorControlledPhaseShift",
}

TEST_CASE("Generators::applyGeneratorSingleExcitation", "[GateGenerators]") {
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix{
// clang-format off
{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 0.0}, {0.0, -1.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}
// clang-format on
};
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix(
16, {0.0, 0.0});
matrix[6] = {0.0, -1.0};
matrix[9] = {0.0, 1.0};
std::mt19937 re{1337U};

for (std::size_t num_qubits = 2; num_qubits <= 5; num_qubits++) {
@@ -875,14 +871,12 @@ TEST_CASE("Generators::applyGeneratorSingleExcitation", "[GateGenerators]") {

TEST_CASE("Generators::applyGeneratorSingleExcitationMinus",
"[GateGenerators]") {
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix{
// clang-format off
{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 0.0}, {0.0,-1.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}
// clang-format on
};
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix(
16, {0.0, 0.0});
matrix[0] = {1.0, 0.0};
matrix[6] = {0.0, -1.0};
matrix[9] = {0.0, 1.0};
matrix[15] = {1.0, 0.0};
std::mt19937 re{1337U};

for (std::size_t num_qubits = 2; num_qubits <= 5; num_qubits++) {
@@ -957,14 +951,12 @@ TEST_CASE("Generators::applyGeneratorSingleExcitationMinus",

TEST_CASE("Generators::applyGeneratorSingleExcitationPlus",
"[GateGenerators]") {
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix{
// clang-format off
{-1.0, 0.0},{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 0.0}, {0.0,-1.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {0.0, 0.0},
{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0}
// clang-format on
};
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix(
16, {0.0, 0.0});
matrix[0] = {-1.0, 0.0};
matrix[6] = {0.0, -1.0};
matrix[9] = {0.0, 1.0};
matrix[15] = {-1.0, 0.0};
std::mt19937 re{1337U};

for (std::size_t num_qubits = 2; num_qubits <= 5; num_qubits++) {
@@ -1058,26 +1050,10 @@ TEST_CASE("Generators::applyGeneratorDoubleExcitation_GPU",
*/
// clang-format on

std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix{
// clang-format off
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, -1.0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 1.0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0}
// clang-format on
};
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix(
256, {0.0, 0.0});
matrix[60] = {0.0, -1.0};
matrix[195] = {0.0, 1.0};
std::mt19937 re{1337U};

for (std::size_t num_qubits = 4; num_qubits <= 8; num_qubits++) {
@@ -1167,26 +1143,16 @@ TEST_CASE("Generators::applyGeneratorDoubleExcitation_GPU",

TEST_CASE("Generators::applyGeneratorDoubleExcitationMinus_GPU",
"[GateGenerators]") {
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix{
// clang-format off
{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, -1.0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 1.0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0}
// clang-format on
};
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix(
256, {0.0, 0.0});
matrix[60] = {0.0, -1.0};
matrix[195] = {0.0, 1.0};
for (std::size_t i = 0; i < 16; i++) {
if (i != 3 && i != 12) {
const size_t idx = i * 17;
matrix[idx] = {1.0, 0.0};
}
}
std::mt19937 re{1337U};

for (std::size_t num_qubits = 4; num_qubits <= 8; num_qubits++) {
@@ -1276,26 +1242,16 @@ TEST_CASE("Generators::applyGeneratorDoubleExcitationPlus_GPU",

TEST_CASE("Generators::applyGeneratorDoubleExcitationPlus_GPU",
"[GateGenerators]") {
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix{
// clang-format off
{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, -1.0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 1.0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},
{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0}
// clang-format on
};
std::vector<typename StateVectorCudaManaged<double>::CFP_t> matrix(
256, {0.0, 0.0});
matrix[60] = {0.0, -1.0};
matrix[195] = {0.0, 1.0};
for (std::size_t i = 0; i < 16; i++) {
if (i != 3 && i != 12) {
const size_t idx = i * 17;
matrix[idx] = {-1.0, 0.0};
}
}
std::mt19937 re{1337U};

for (std::size_t num_qubits = 4; num_qubits <= 8; num_qubits++) {