diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 423c10879..a3d437563 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -9,6 +9,9 @@
 ### Improvements
 
+* Unify the excitation gates' memory layout to row-major for both LGPU and LT.
+  [(#959)](https://github.com/PennyLaneAI/pennylane-lightning/pull/959)
+
 * Update the `lightning.kokkos` CUDA backend for compatibility with Catalyst.
   [(#942)](https://github.com/PennyLaneAI/pennylane-lightning/pull/942)
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index e410f98f4..8432c4f82 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
 Version number (major.minor.patch[-label])
 """
-__version__ = "0.40.0-dev3"
+__version__ = "0.40.0-dev4"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index f9c0f0116..6fbf9d8b8 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -399,39 +399,18 @@ class StateVectorCudaMPI final
             applyParametricPauliGate({opName}, ctrls, tgts, params.front(),
                                      adjoint);
         } else if (opName == "Rot" || opName == "CRot") {
-            if (adjoint) {
-                auto rot_matrix =
-                    cuGates::getRot<CFP_t>(params[2], params[1], params[0]);
-                applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, true);
-            } else {
-                auto rot_matrix =
-                    cuGates::getRot<CFP_t>(params[0], params[1], params[2]);
-                applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, false);
-            }
+            auto rot_matrix =
+                adjoint
+                    ? cuGates::getRot<CFP_t>(params[2], params[1], params[0])
+                    : cuGates::getRot<CFP_t>(params[0], params[1], params[2]);
+            applyDeviceMatrixGate(rot_matrix.data(), ctrls, tgts, adjoint);
         } else if (opName == "Matrix") {
-            DataBuffer<CFP_t, int> d_matrix{
-                gate_matrix.size(), BaseType::getDataBuffer().getDevTag(),
-                true};
-            d_matrix.CopyHostDataToGpu(gate_matrix.data(), d_matrix.getLength(),
-                                       false);
-            // ensure wire indexing correctly preserved for tensor-observables
-            const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
-                                                       ctrls.rend()};
-            const std::vector<std::size_t> tgts_local{tgts.rbegin(),
-                                                      tgts.rend()};
-            applyDeviceMatrixGate(d_matrix.getData(), ctrls_local, tgts_local,
-                                  adjoint);
+            applyDeviceMatrixGate(gate_matrix.data(), ctrls, tgts, adjoint);
         } else if (par_gates_.find(opName) != par_gates_.end()) {
             par_gates_.at(opName)(wires, adjoint, params);
         } else { // No offloadable function call; defer to matrix passing
             auto &&par = (params.empty()) ? std::vector{0.0} : params;
-            // ensure wire indexing correctly preserved for tensor-observables
-            const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
-                                                       ctrls.rend()};
-            const std::vector<std::size_t> tgts_local{tgts.rbegin(),
-                                                      tgts.rend()};
-
             if (!gate_cache_.gateExists(opName, par[0]) &&
                 gate_matrix.empty()) {
                 std::string message = "Currently unsupported gate: " + opName;
@@ -440,8 +419,8 @@ class StateVectorCudaMPI final
                 gate_cache_.add_gate(opName, par[0], gate_matrix);
             }
             applyDeviceMatrixGate(
-                gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local,
-                tgts_local, adjoint);
+                gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls, tgts,
+                adjoint);
         }
     }
@@ -1826,9 +1805,8 @@ class StateVectorCudaMPI final
      * @param tgts Target qubits.
      * @param use_adjoint Use adjoint of given gate.
      */
-    void applyCuSVDeviceMatrixGate(const CFP_t *matrix,
-                                   const std::vector<int> &ctrls,
-                                   const std::vector<int> &tgts,
+    void applyCuSVDeviceMatrixGate(const CFP_t *matrix, std::vector<int> &ctrls,
+                                   std::vector<int> &tgts,
                                    bool use_adjoint = false) {
         void *extraWorkspace = nullptr;
         std::size_t extraWorkspaceSizeInBytes = 0;
@@ -1846,6 +1824,9 @@ class StateVectorCudaMPI final
             compute_type = CUSTATEVEC_COMPUTE_32F;
         }
 
+        std::reverse(tgts.begin(), tgts.end());
+        std::reverse(ctrls.begin(), ctrls.end());
+
         // check the size of external workspace
         PL_CUSTATEVEC_IS_SUCCESS(custatevecApplyMatrixGetWorkspaceSize(
             /* custatevecHandle_t */ handle_.get(),
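Note on the two `applyCuSVDeviceMatrixGate` hunks above: the wire-order reversal that `applyOperation` used to perform through the `ctrls_local`/`tgts_local` copies now happens exactly once, at the point where the indices are handed to cuStateVec. cuStateVec numbers qubits with index 0 as the least-significant bit, the reverse of the wire order used higher up the stack, so a single flip next to the `custatevecApplyMatrix` call suffices. A minimal standalone sketch of the pattern (illustrative only, not the repository's actual helper):

```cpp
#include <algorithm>
#include <vector>

// Hypothetical stand-in for the low-level gate-application helper: the
// reversal lives next to the cuStateVec call instead of being repeated
// with reversed copies at every call site.
void applyMatrixSketch(std::vector<int> &ctrls, std::vector<int> &tgts) {
    std::reverse(tgts.begin(), tgts.end());
    std::reverse(ctrls.begin(), ctrls.end());
    // ... pass ctrls.data() / tgts.data() to custatevecApplyMatrix ...
}
```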
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index d66e32c2e..d354133be 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -314,29 +314,12 @@ class StateVectorCudaManaged
                 applyDeviceMatrixGate_(rot_matrix.data(), ctrls, tgts, false);
             }
         } else if (opName == "Matrix") {
-            DataBuffer<CFP_t, int> d_matrix{
-                gate_matrix.size(), BaseType::getDataBuffer().getDevTag(),
-                true};
-            d_matrix.CopyHostDataToGpu(gate_matrix.data(), d_matrix.getLength(),
-                                       false);
-            // ensure wire indexing correctly preserved for tensor-observables
-            const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
-                                                       ctrls.rend()};
-            const std::vector<std::size_t> tgts_local{tgts.rbegin(),
-                                                      tgts.rend()};
-            applyDeviceMatrixGate_(d_matrix.getData(), ctrls_local, tgts_local,
-                                   adjoint);
+            applyDeviceMatrixGate_(gate_matrix.data(), ctrls, tgts, adjoint);
         } else if (par_gates_.find(opName) != par_gates_.end()) {
             par_gates_.at(opName)(wires, adjoint, params);
         } else { // No offloadable function call; defer to matrix passing
             auto &&par = (params.empty()) ? std::vector{0.0} : params;
-            // ensure wire indexing correctly preserved for tensor-observables
-            const std::vector<std::size_t> ctrls_local{ctrls.rbegin(),
-                                                       ctrls.rend()};
-            const std::vector<std::size_t> tgts_local{tgts.rbegin(),
-                                                      tgts.rend()};
-
             if (!gate_cache_.gateExists(opName, par[0]) &&
                 gate_matrix.empty()) {
                 std::string message = "Currently unsupported gate: " + opName +
@@ -346,8 +329,8 @@ class StateVectorCudaManaged
                 gate_cache_.add_gate(opName, par[0], gate_matrix);
             }
             applyDeviceMatrixGate_(
-                gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local,
-                tgts_local, adjoint);
+                gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls, tgts,
+                adjoint);
         }
     }
@@ -432,9 +415,6 @@ class StateVectorCudaManaged
             gate_cache_.add_gate(opName, par[0], matrix_cu);
         }
-        std::reverse(ctrlsInt.begin(), ctrlsInt.end());
-        std::reverse(tgtsInt.begin(), tgtsInt.end());
-        std::reverse(ctrls_valuesInt.begin(), ctrls_valuesInt.end());
         applyDeviceGeneralGate_(
             gate_cache_.get_gate_device_ptr(opName, par[0]), ctrlsInt, tgtsInt,
             ctrls_valuesInt, adjoint);
@@ -474,10 +454,6 @@ class StateVectorCudaManaged
         auto ctrls_valuesInt =
             Pennylane::Util::cast_vector<bool, int>(controlled_values);
 
-        std::reverse(ctrlsInt.begin(), ctrlsInt.end());
-        std::reverse(tgtsInt.begin(), tgtsInt.end());
-        std::reverse(ctrls_valuesInt.begin(), ctrls_valuesInt.end());
-
         applyDeviceGeneralGate_(d_matrix.getData(), ctrlsInt, tgtsInt,
                                 ctrls_valuesInt, inverse);
     }
@@ -1620,10 +1596,9 @@ class StateVectorCudaManaged
      * @param ctrls_values Control values.
      * @param use_adjoint Use adjoint of given gate. Defaults to false.
      */
-    void applyDeviceGeneralGate_(const CFP_t *matrix,
-                                 const std::vector<int> &ctrls,
-                                 const std::vector<int> &tgts,
-                                 const std::vector<int> &ctrls_values,
+    void applyDeviceGeneralGate_(const CFP_t *matrix, std::vector<int> &ctrls,
+                                 std::vector<int> &tgts,
+                                 std::vector<int> &ctrls_values,
                                  bool use_adjoint = false) {
         void *extraWorkspace = nullptr;
         std::size_t extraWorkspaceSizeInBytes = 0;
@@ -1641,6 +1616,10 @@ class StateVectorCudaManaged
             compute_type = CUSTATEVEC_COMPUTE_32F;
         }
 
+        std::reverse(tgts.begin(), tgts.end());
+        std::reverse(ctrls.begin(), ctrls.end());
+        std::reverse(ctrls_values.begin(), ctrls_values.end());
+
         // check the size of external workspace
        PL_CUSTATEVEC_IS_SUCCESS(custatevecApplyMatrixGetWorkspaceSize(
             /* custatevecHandle_t */ handle_.get(),
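The same consolidation is applied to `StateVectorCudaManaged` above, with one signature change worth flagging: `ctrls`, `tgts`, and `ctrls_values` are now taken by non-const reference so `applyDeviceGeneralGate_` can reverse them in place instead of allocating reversed copies. The trade-off, sketched with hypothetical helper names:

```cpp
#include <algorithm>
#include <vector>

// Old pattern: each call site allocated a reversed copy.
std::vector<int> reversedCopy(const std::vector<int> &v) {
    return {v.rbegin(), v.rend()};
}

// New pattern: the helper reverses its argument in place. The non-const
// reference signals that callers must not rely on the original order
// after the call.
void reverseInPlace(std::vector<int> &v) {
    std::reverse(v.begin(), v.end());
}
```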
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Generators.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Generators.cpp
index 1129e5a66..a5aba04eb 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Generators.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_Generators.cpp
@@ -793,14 +793,10 @@ TEST_CASE("Generators::applyGeneratorControlledPhaseShift",
 }
 
 TEST_CASE("Generators::applyGeneratorSingleExcitation", "[GateGenerators]") {
-    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix{
-        // clang-format off
-        {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 0.0}, {0.0, -1.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}
-        // clang-format on
-    };
+    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix(
+        16, {0.0, 0.0});
+    matrix[6] = {0.0, -1.0};
+    matrix[9] = {0.0, 1.0};
     std::mt19937 re{1337U};
 
     for (std::size_t num_qubits = 2; num_qubits <= 5; num_qubits++) {
@@ -875,14 +871,12 @@ TEST_CASE("Generators::applyGeneratorSingleExcitation", "[GateGenerators]") {
 
 TEST_CASE("Generators::applyGeneratorSingleExcitationMinus",
           "[GateGenerators]") {
-    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix{
-        // clang-format off
-        {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 0.0}, {0.0,-1.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}
-        // clang-format on
-    };
+    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix(
+        16, {0.0, 0.0});
+    matrix[0] = {1.0, 0.0};
+    matrix[6] = {0.0, -1.0};
+    matrix[9] = {0.0, 1.0};
+    matrix[15] = {1.0, 0.0};
     std::mt19937 re{1337U};
 
     for (std::size_t num_qubits = 2; num_qubits <= 5; num_qubits++) {
@@ -957,14 +951,12 @@ TEST_CASE("Generators::applyGeneratorSingleExcitationMinus",
 
 TEST_CASE("Generators::applyGeneratorSingleExcitationPlus",
           "[GateGenerators]") {
-    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix{
-        // clang-format off
-        {-1.0, 0.0},{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 0.0}, {0.0,-1.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 1.0}, {0.0, 0.0}, {0.0, 0.0},
-        {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0}
-        // clang-format on
-    };
+    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix(
+        16, {0.0, 0.0});
+    matrix[0] = {-1.0, 0.0};
+    matrix[6] = {0.0, -1.0};
+    matrix[9] = {0.0, 1.0};
+    matrix[15] = {-1.0, 0.0};
     std::mt19937 re{1337U};
 
     for (std::size_t num_qubits = 2; num_qubits <= 5; num_qubits++) {
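The test refactors above (and the double-excitation ones below) replace literal matrix tables with a zero-filled vector plus a few indexed writes. The flat indices follow from row-major layout: element (row, col) of an n x n matrix sits at n*row + col. The double-excitation generator couples the basis states |0011> (row 3) and |1100> (row 12) of the 16x16 matrix, which is where 60 and 195 come from. A compile-time sketch of the arithmetic (standalone, with a hypothetical helper name):

```cpp
#include <cstddef>

// Row-major flat index of element (row, col) in an n x n matrix.
constexpr std::size_t flatIndex(std::size_t row, std::size_t col,
                                std::size_t n) {
    return n * row + col;
}

static_assert(flatIndex(1, 2, 4) == 6);       // 4x4 generators: the -i entry
static_assert(flatIndex(2, 1, 4) == 9);       // 4x4 generators: the +i entry
static_assert(flatIndex(3, 12, 16) == 60);    // 16x16 generators: the -i entry
static_assert(flatIndex(12, 3, 16) == 195);   // 16x16 generators: the +i entry
static_assert(flatIndex(5, 5, 16) == 5 * 17); // diagonal entries sit at i * 17
```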
@@ -1058,26 +1050,10 @@ TEST_CASE("Generators::applyGeneratorDoubleExcitation_GPU",
     */
     // clang-format on
 
-    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix{
-        // clang-format off
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, -1.0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 1.0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0}
-        // clang-format on
-    };
+    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix(
+        256, {0.0, 0.0});
+    matrix[60] = {0.0, -1.0};
+    matrix[195] = {0.0, 1.0};
     std::mt19937 re{1337U};
 
     for (std::size_t num_qubits = 4; num_qubits <= 8; num_qubits++) {
@@ -1167,26 +1143,16 @@
 
 TEST_CASE("Generators::applyGeneratorDoubleExcitationMinus_GPU",
           "[GateGenerators]") {
-    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix{
-        // clang-format off
-        {1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, -1.0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 1.0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{1.0, 0}
-        // clang-format on
-    };
+    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix(
+        256, {0.0, 0.0});
+    matrix[60] = {0.0, -1.0};
+    matrix[195] = {0.0, 1.0};
+    for (std::size_t i = 0; i < 16; i++) {
+        if (i != 3 && i != 12) {
+            const std::size_t idx = i * 17;
+            matrix[idx] = {1.0, 0.0};
+        }
+    }
     std::mt19937 re{1337U};
 
     for (std::size_t num_qubits = 4; num_qubits <= 8; num_qubits++) {
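The Minus and Plus double-excitation generators share the two off-diagonal entries above and differ only in the diagonal they add outside rows 3 and 12: +1 for Minus, -1 for Plus (next hunk). A condensed sketch of the construction the two tests now share, with `sign` as a hypothetical stand-in parameter and plain `std::complex` instead of the repository's `CFP_t`:

```cpp
#include <complex>
#include <cstddef>
#include <vector>

// Build the 16x16 DoubleExcitationMinus/Plus generator used by the tests:
// -i/+i off-diagonal at (3, 12) and (12, 3), and `sign` (+1.0 for Minus,
// -1.0 for Plus) on the diagonal outside rows 3 and 12.
std::vector<std::complex<double>> doubleExcitationGenerator(double sign) {
    std::vector<std::complex<double>> mat(256, {0.0, 0.0});
    mat[60] = {0.0, -1.0};
    mat[195] = {0.0, 1.0};
    for (std::size_t i = 0; i < 16; i++) {
        if (i != 3 && i != 12) {
            mat[i * 17] = {sign, 0.0}; // diagonal entry (i, i)
        }
    }
    return mat;
}
```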
@@ -1276,26 +1242,16 @@
 
 TEST_CASE("Generators::applyGeneratorDoubleExcitationPlus_GPU",
           "[GateGenerators]") {
-    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix{
-        // clang-format off
-        {-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, -1.0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 1.0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0},{0, 0},
-        {0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{0, 0},{-1.0, 0}
-        // clang-format on
-    };
+    std::vector<StateVectorCudaManaged<double>::CFP_t> matrix(
+        256, {0.0, 0.0});
+    matrix[60] = {0.0, -1.0};
+    matrix[195] = {0.0, 1.0};
+    for (std::size_t i = 0; i < 16; i++) {
+        if (i != 3 && i != 12) {
+            const std::size_t idx = i * 17;
+            matrix[idx] = {-1.0, 0.0};
+        }
+    }
     std::mt19937 re{1337U};
 
     for (std::size_t num_qubits = 4; num_qubits <= 8; num_qubits++) {
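The `cuGates_host.hpp` changes below remove the `_ENABLE_PLGPU` conditional that used to flip the sign of `s`. Because the excitation rotations are real, the column-major layout is simply the transpose, which only swaps the +sin/-sin off-diagonal pair; unifying on row-major lets one definition serve both backends, per the changelog entry. A standalone sketch using PennyLane's documented `SingleExcitation` matrix (plain `std::complex` rather than `CFP_t`):

```cpp
#include <cmath>
#include <complex>
#include <vector>

// Row-major SingleExcitation(angle): the rotation acts on the |01>/|10>
// subspace. Transposing to column-major would only swap the -s and +s
// entries, which is exactly what the removed #ifdef branch used to do.
std::vector<std::complex<double>> singleExcitationRowMajor(double angle) {
    const double c = std::cos(angle / 2);
    const double s = std::sin(angle / 2);
    return {1, 0, 0, 0,  //
            0, c, -s, 0, //
            0, s, c, 0,  //
            0, 0, 0, 1};
}
```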
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/cuGates_host.hpp b/pennylane_lightning/core/src/utils/cuda_utils/cuGates_host.hpp
index 53b712ed5..4d09555d7 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/cuGates_host.hpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/cuGates_host.hpp
@@ -654,15 +654,8 @@ template <class CFP_t, class U = double>
 static auto getSingleExcitation(U angle) -> std::vector<CFP_t> {
     const U p2 = angle / 2;
     const CFP_t c{std::cos(p2), 0};
-    // TODO: To remove conditional compilation here in the future, current
-    // implementation will block the simultaneous installation of LGPU and
-    // cutensornet backends
-
-#ifdef _ENABLE_PLGPU
-    const CFP_t s{-std::sin(p2), 0}; // column-major
-#else
     const CFP_t s{std::sin(p2), 0}; // row-major
-#endif
+
     return {cuUtil::ONE<CFP_t>(),
             cuUtil::ZERO<CFP_t>(),
             cuUtil::ZERO<CFP_t>(),
@@ -708,17 +701,17 @@ static auto getSingleExcitation(const std::vector<U> &params)
 template <class CFP_t>
 static constexpr auto getGeneratorSingleExcitation() -> std::vector<CFP_t> {
     return {
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::IMAG<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        -cuUtil::IMAG<CFP_t>(), cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), -cuUtil::IMAG<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::IMAG<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
     };
 }
@@ -739,14 +732,7 @@ static auto getSingleExcitationMinus(U angle) -> std::vector<CFP_t> {
     const U p2 = angle / 2;
     const CFP_t e = cuUtil::complexToCu<std::complex<U>>(
         std::exp(std::complex<U>(0, -p2)));
     const CFP_t c{std::cos(p2), 0};
-// TODO: To remove conditional compilation here in the future, current
-// implementation will block the simultaneous installation of LGPU and
-// cutensornet backends
-#ifdef _ENABLE_PLGPU
-    const CFP_t s{-std::sin(p2), 0}; // column-major
-#else
     const CFP_t s{std::sin(p2), 0}; // row-major
-#endif
 
     return {e,
             cuUtil::ZERO<CFP_t>(),
@@ -795,17 +781,17 @@ template <class CFP_t>
 static constexpr auto getGeneratorSingleExcitationMinus()
     -> std::vector<CFP_t> {
     return {
-        cuUtil::ONE<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ONE<CFP_t>(),   cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::IMAG<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        -cuUtil::IMAG<CFP_t>(), cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), -cuUtil::IMAG<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::IMAG<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ONE<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ONE<CFP_t>(),
     };
 }
@@ -826,15 +812,8 @@ static auto getSingleExcitationPlus(U angle) -> std::vector<CFP_t> {
     const U p2 = angle / 2;
     const CFP_t e = cuUtil::complexToCu<std::complex<U>>(
         std::exp(std::complex<U>(0, p2)));
     const CFP_t c{std::cos(p2), 0};
-    // TODO: To remove conditional compilation here in the future, current
-    // implementation will block the simultaneous installation of LGPU and
-    // cutensornet backends
-
-#ifdef _ENABLE_PLGPU
-    const CFP_t s{-std::sin(p2), 0}; // column-major
-#else
     const CFP_t s{std::sin(p2), 0}; // row-major
-#endif
+
     return {e,
             cuUtil::ZERO<CFP_t>(),
             cuUtil::ZERO<CFP_t>(),
@@ -881,17 +860,17 @@ static auto getSingleExcitationPlus(const std::vector<U> &params)
 template <class CFP_t>
 static constexpr auto getGeneratorSingleExcitationPlus() -> std::vector<CFP_t> {
     return {
-        -cuUtil::ONE<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        -cuUtil::ONE<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::IMAG<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        -cuUtil::IMAG<CFP_t>(), cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), -cuUtil::IMAG<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::IMAG<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
 
-        cuUtil::ZERO<CFP_t>(), cuUtil::ZERO<CFP_t>(),
-        cuUtil::ZERO<CFP_t>(), -cuUtil::ONE<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  cuUtil::ZERO<CFP_t>(),
+        cuUtil::ZERO<CFP_t>(),  -cuUtil::ONE<CFP_t>(),
     };
 }
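In each 4x4 generator above, the sign swap (`-IMAG` moving from flat index 9 to index 6) switches the matrix to the row-major convention: -i now sits at (row 1, col 2) and +i at (row 2, col 1), matching the reworked tests; the hunks below make the same swap at indices 60 and 195 of the 16x16 generators. Either layout is Hermitian, which a quick standalone check (plain `std::complex`, not the repository's `CFP_t`) makes explicit:

```cpp
#include <cassert>
#include <complex>
#include <vector>

int main() {
    using C = std::complex<double>;
    std::vector<C> g(16, C{0.0, 0.0});
    g[6] = C{0.0, -1.0}; // (row 1, col 2) in row-major order
    g[9] = C{0.0, 1.0};  // (row 2, col 1)
    // A generator must be Hermitian: g[r*4 + c] == conj(g[c*4 + r]).
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            assert(g[r * 4 + c] == std::conj(g[c * 4 + r]));
    return 0;
}
```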
@@ -909,15 +888,8 @@ template <class CFP_t, class U = double>
 static auto getDoubleExcitation(U angle) -> std::vector<CFP_t> {
     const U p2 = angle / 2;
     const CFP_t c{std::cos(p2), 0};
-    // TODO: To remove conditional compilation here in the future, current
-    // implementation will block the simultaneous installation of LGPU and
-    // cutensornet backends
-
-#ifdef _ENABLE_PLGPU
-    const CFP_t s{-std::sin(p2), 0}; // column-major
-#else
     const CFP_t s{std::sin(p2), 0}; // row-major
-#endif
+
     std::vector<CFP_t> mat(256, cuUtil::ZERO<CFP_t>());
     mat[0] = cuUtil::ONE<CFP_t>();
     mat[17] = cuUtil::ONE<CFP_t>();
@@ -967,8 +939,8 @@ static auto getDoubleExcitation(const std::vector<U> &params)
 template <class CFP_t>
 static constexpr auto getGeneratorDoubleExcitation() -> std::vector<CFP_t> {
     std::vector<CFP_t> mat(256, cuUtil::ZERO<CFP_t>());
-    mat[60] = cuUtil::IMAG<CFP_t>();
-    mat[195] = -cuUtil::IMAG<CFP_t>();
+    mat[60] = -cuUtil::IMAG<CFP_t>();
+    mat[195] = cuUtil::IMAG<CFP_t>();
     return mat;
 }
@@ -989,15 +961,8 @@ static auto getDoubleExcitationMinus(U angle) -> std::vector<CFP_t> {
     const CFP_t e = cuUtil::complexToCu<std::complex<U>>(
         std::exp(std::complex<U>(0, -p2)));
     const CFP_t c{std::cos(p2), 0};
-    // TODO: To remove conditional compilation here in the future, current
-    // implementation will block the simultaneous installation of LGPU and
-    // cutensornet backends
-
-#ifdef _ENABLE_PLGPU
-    const CFP_t s{-std::sin(p2), 0}; // column-major
-#else
     const CFP_t s{std::sin(p2), 0}; // row-major
-#endif
+
     std::vector<CFP_t> mat(256, cuUtil::ZERO<CFP_t>());
     mat[0] = e;
     mat[17] = e;
@@ -1052,7 +1017,7 @@ static constexpr auto getGeneratorDoubleExcitationMinus()
     mat[0] = cuUtil::ONE<CFP_t>();
     mat[17] = cuUtil::ONE<CFP_t>();
     mat[34] = cuUtil::ONE<CFP_t>();
-    mat[60] = cuUtil::IMAG<CFP_t>();
+    mat[60] = -cuUtil::IMAG<CFP_t>();
     mat[68] = cuUtil::ONE<CFP_t>();
     mat[85] = cuUtil::ONE<CFP_t>();
     mat[102] = cuUtil::ONE<CFP_t>();
@@ -1061,7 +1026,7 @@ static constexpr auto getGeneratorDoubleExcitationMinus()
     mat[153] = cuUtil::ONE<CFP_t>();
     mat[170] = cuUtil::ONE<CFP_t>();
     mat[187] = cuUtil::ONE<CFP_t>();
-    mat[195] = -cuUtil::IMAG<CFP_t>();
+    mat[195] = cuUtil::IMAG<CFP_t>();
     mat[221] = cuUtil::ONE<CFP_t>();
     mat[238] = cuUtil::ONE<CFP_t>();
     mat[255] = cuUtil::ONE<CFP_t>();
@@ -1085,14 +1050,8 @@ static auto getDoubleExcitationPlus(U angle) -> std::vector<CFP_t> {
     const CFP_t e = cuUtil::complexToCu<std::complex<U>>(
         std::exp(std::complex<U>(0, p2)));
     const CFP_t c{std::cos(p2), 0};
-    // TODO: To remove conditional compilation here in the future, current
-    // implementation will block the simultaneous installation of LGPU and
-    // cutensornet backends
-#ifdef _ENABLE_PLGPU
-    const CFP_t s{-std::sin(p2), 0}; // column-major
-#else
     const CFP_t s{std::sin(p2), 0}; // row-major
-#endif
+
     std::vector<CFP_t> mat(256, cuUtil::ZERO<CFP_t>());
     mat[0] = e;
     mat[17] = e;
@@ -1146,7 +1105,7 @@ static constexpr auto getGeneratorDoubleExcitationPlus() -> std::vector<CFP_t> {
     mat[0] = -cuUtil::ONE<CFP_t>();
     mat[17] = -cuUtil::ONE<CFP_t>();
     mat[34] = -cuUtil::ONE<CFP_t>();
-    mat[60] = cuUtil::IMAG<CFP_t>();
+    mat[60] = -cuUtil::IMAG<CFP_t>();
     mat[68] = -cuUtil::ONE<CFP_t>();
     mat[85] = -cuUtil::ONE<CFP_t>();
     mat[102] = -cuUtil::ONE<CFP_t>();
@@ -1155,7 +1114,7 @@ static constexpr auto getGeneratorDoubleExcitationPlus() -> std::vector<CFP_t> {
     mat[153] = -cuUtil::ONE<CFP_t>();
     mat[170] = -cuUtil::ONE<CFP_t>();
     mat[187] = -cuUtil::ONE<CFP_t>();
-    mat[195] = -cuUtil::IMAG<CFP_t>();
+    mat[195] = cuUtil::IMAG<CFP_t>();
     mat[221] = -cuUtil::ONE<CFP_t>();
     mat[238] = -cuUtil::ONE<CFP_t>();
     mat[255] = -cuUtil::ONE<CFP_t>();