PennyLaneAI · multiphaseCFD · Oct 23, 2024 · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -46,6 +46,9 @@
 
 ### Improvements
 
+* Optimize `GlobalPhase` and `C(GlobalPhase)` gate implementation in `lightning.gpu`.
+  [(#946)](https://github.com/PennyLaneAI/pennylane-lightning/pull/946)
+
 * Add joint check for the N-controlled wires support in `lightning.qubit`.
   [(#949)](https://github.com/PennyLaneAI/pennylane-lightning/pull/949)
 

diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py
@@ -235,6 +235,17 @@ def test_apply_operation_4gatequbit_1param_gate_qnode_param(
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
 
+    @pytest.mark.parametrize(
+        "operation",
+        [qml.GlobalPhase],
+    )
+    @pytest.mark.parametrize("par", [[0.13], [0.2], [0.3]])
+    def test_apply_global_phase(self, tol, operation, par, dev_mpi):
+        """Run GlobalPhase on all numQubits wires via apply_operation_gates_qnode_param."""
+        Wires = range(numQubits)
+
+        apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
+
     # BasisState test
     @pytest.mark.parametrize("operation", [qml.BasisState])
     @pytest.mark.parametrize("index", range(numQubits))

diff --git a/mpitests/test_device.py b/mpitests/test_device.py
@@ -21,6 +21,8 @@
 from conftest import LightningDevice as ld
 from conftest import device_name
 from mpi4py import MPI
+from pennylane import DeviceError
+from pennylane.tape import QuantumScript
 
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
@@ -52,3 +54,15 @@ def test_unsupported_mpi_buf_size():
         match="Number of processes should be smaller than the number of statevector elements",
     ):
         dev = qml.device(device_name, mpi=True, wires=1)
+
+
+def test_unsupported_gate():
+    """Controlled GlobalPhase must be rejected with a DeviceError by the MPI device."""
+    comm = MPI.COMM_WORLD
+    dev = qml.device(device_name, mpi=True, wires=4)
+    op = qml.ctrl(qml.GlobalPhase(0.1, wires=[1, 2, 3]), [0], control_values=[True])
+    with pytest.raises(
+        DeviceError, match="Lightning-GPU-MPI does not support Controlled GlobalPhase gates"
+    ):
+        dev.execute(QuantumScript([op]))
+    comm.Barrier()  # outside the block: statements after the raising call never run
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev47"
+__version__ = "0.39.0-dev48"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -386,6 +386,15 @@ class StateVectorCudaMPI final
 
         if (opName == "Identity") {
             return;
+        } else if (opName == "GlobalPhase") {
+            PrecisionT param = adjoint ? -params[0] : params[0];
+            CFP_t scale_factor{std::cos(param), -std::sin(param)};
+            scaleC_CUDA<CFP_t, CFP_t, int>(
+                scale_factor, BaseType::getDataBuffer().getData(),
+                BaseType::getDataBuffer().getLength(),
+                BaseType::getDataBuffer().getDevTag().getDeviceID(),
+                BaseType::getDataBuffer().getDevTag().getStreamID(),
+                getCublasCaller());
         } else if (native_gates_.find(opName) != native_gates_.end()) {
             applyParametricPauliGate({opName}, ctrls, tgts, params.front(),
                                      adjoint);

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -41,6 +41,7 @@
 #include "cuStateVec_helpers.hpp"
 
 #include "LinearAlg.hpp"
+#include "Util.hpp"
 
 /// @cond DEV
 namespace {
@@ -67,25 +68,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                                const std::size_t index, bool async,
                                cudaStream_t stream_id);
 
-extern void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
-                                        cuComplex phase,
-                                        std::size_t thread_per_block,
-                                        cudaStream_t stream_id);
-extern void globalPhaseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv,
-                                        cuDoubleComplex phase,
-                                        std::size_t thread_per_block,
-                                        cudaStream_t stream_id);
-
-extern void cGlobalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
-                                         bool adjoint, cuComplex *phase,
-                                         std::size_t thread_per_block,
-                                         cudaStream_t stream_id);
-extern void cGlobalPhaseStateVector_CUDA(cuDoubleComplex *sv,
-                                         std::size_t num_sv, bool adjoint,
-                                         cuDoubleComplex *phase,
-                                         std::size_t thread_per_block,
-                                         cudaStream_t stream_id);
-
 /**
  * @brief Managed memory CUDA state-vector class using custateVec backed
  * gate-calls.
@@ -257,44 +239,6 @@ class StateVectorCudaManaged
                                     use_async);
     }
 
-    /**
-     * @brief Multiplies the state-vector by a global phase.
-     *
-     * @param adjoint Indicates whether to use adjoint of gate.
-     * @param param Complex phase generator.
-     */
-    template <std::size_t thread_per_block = 256>
-    void globalPhaseStateVector(const bool adjoint, const Precision param) {
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-        std::complex<Precision> phase =
-            std::exp(std::complex<Precision>{0, (adjoint) ? param : -param});
-        auto cuPhase = complexToCu(phase);
-        globalPhaseStateVector_CUDA(BaseType::getData(), BaseType::getLength(),
-                                    cuPhase, thread_per_block, stream_id);
-    }
-
-    /**
-     * @brief Multiplies the state-vector by a controlled global phase.
-     *
-     * @param phase Controlled complex phase vector.
-     */
-    template <std::size_t thread_per_block = 256>
-    void cGlobalPhaseStateVector(const bool adjoint,
-                                 const std::vector<CFP_t> &phase,
-                                 const bool async = false) {
-        PL_ABORT_IF_NOT(BaseType::getLength() == phase.size(),
-                        "The state-vector data must have the same size as the "
-                        "controlled-phase data.")
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-        DataBuffer<CFP_t, int> d_phase{phase.size(), device_id, stream_id,
-                                       true};
-        d_phase.CopyHostDataToGpu(phase.data(), d_phase.getLength(), async);
-        cGlobalPhaseStateVector_CUDA(BaseType::getData(), BaseType::getLength(),
-                                     adjoint, d_phase.getData(),
-                                     thread_per_block, stream_id);
-    }
-
     /**
      * @brief Apply a single gate to the state-vector. Offloads to custatevec
      * specific API calls if available. If unable, attempts to use prior cached
@@ -347,13 +291,18 @@ class StateVectorCudaManaged
                                             wires.end()};
         if (opName == "Identity") {
             return;
-        } else if (opName == "C(GlobalPhase)") {
-            cGlobalPhaseStateVector(adjoint, gate_matrix);
         } else if (opName == "GlobalPhase") {
-            globalPhaseStateVector(adjoint, params[0]);
+            PrecisionT param = adjoint ? -params[0] : params[0];
+            CFP_t scale_factor{std::cos(param), -std::sin(param)};
+            scaleC_CUDA<CFP_t, CFP_t, int>(
+                scale_factor, BaseType::getDataBuffer().getData(),
+                BaseType::getDataBuffer().getLength(),
+                BaseType::getDataBuffer().getDevTag().getDeviceID(),
+                BaseType::getDataBuffer().getDevTag().getStreamID(),
+                getCublasCaller());
         } else if (native_gates_.find(opName) != native_gates_.end()) {
-            applyParametricPauliGate({opName}, ctrls, tgts, params.front(),
-                                     adjoint);
+            applyParametricPauliGate_({opName}, ctrls, tgts, params.front(),
+                                      adjoint);
         } else if (opName == "Rot" || opName == "CRot") {
             if (adjoint) {
                 auto rot_matrix =
@@ -407,25 +356,34 @@ class StateVectorCudaManaged
      * @param opName Name of gate to apply.
      * @param controlled_wires Control wires.
      * @param controlled_values Control values (false or true).
-     * @param wires Wires to apply gate to.
-     * @param inverse Indicates whether to use adjoint of gate.
+     * @param tgt_wires Wires to apply gate to.
+     * @param adjoint Indicates whether to use adjoint of gate.
      * @param params Optional parameter list for parametric gates.
-     * @param params Optional std gate matrix if opName doesn't exist.
+     * @param gate_matrix Gate matrix; currently unused by this overload.
      */
-    template <template <typename...> class complex_t>
-    void
-    applyOperation(const std::string &opName,
-                   const std::vector<std::size_t> &controlled_wires,
-                   const std::vector<bool> &controlled_values,
-                   const std::vector<std::size_t> &wires, bool inverse = false,
-                   const std::vector<Precision> &params = {0.0},
-                   const std::vector<complex_t<Precision>> &gate_matrix = {}) {
-        PL_ABORT_IF_NOT(controlled_wires.empty(),
-                        "Controlled kernels not implemented.");
-        PL_ABORT_IF_NOT(controlled_wires.size() == controlled_values.size(),
-                        "`controlled_wires` must have the same size as "
-                        "`controlled_values`.");
-        applyOperation(opName, wires, inverse, params, gate_matrix);
+    void applyOperation(
+        const std::string &opName,
+        const std::vector<std::size_t> &controlled_wires,
+        const std::vector<bool> &controlled_values,
+        const std::vector<std::size_t> &tgt_wires, bool adjoint = false,
+        const std::vector<Precision> &params = {0.0},
+        [[maybe_unused]] const std::vector<ComplexT> &gate_matrix = {}) {
+        PL_ABORT_IF_NOT(opName == "GlobalPhase",
+                        "Only GlobalPhase gate is supported.");
+        PL_ABORT_IF(controlled_wires.size() != controlled_values.size(),
+                    "`ctrls` and `ctrls_values` must have the same size.");
+        PL_ABORT_IF(params.empty(), "GlobalPhase requires one parameter.");
+        auto ctrlsInt = NormalizeCastIndices<std::size_t, int>(
+            controlled_wires, BaseType::getNumQubits());
+        auto tgtsInt = NormalizeCastIndices<std::size_t, int>(
+            tgt_wires, BaseType::getNumQubits());
+        auto ctrls_valuesInt =
+            Pennylane::Util::cast_vector<bool, int>(controlled_values);
+
+        // The Pauli-rotation helper halves its angle, hence 2 * params[0].
+        const std::vector<std::string> names(tgt_wires.size(), "I");
+        applyParametricPauliGeneralGate_(names, ctrlsInt, ctrls_valuesInt,
+                                         tgtsInt, 2 * params[0], adjoint);
+    }
 
     /**
@@ -595,20 +553,20 @@ class StateVectorCudaManaged
     inline void applyRX(const std::vector<std::size_t> &wires, bool adjoint,
                         Precision param) {
         static const std::vector<std::string> name{{"RX"}};
-        applyParametricPauliGate(name, {wires.begin(), wires.end() - 1},
-                                 {wires.back()}, param, adjoint);
+        applyParametricPauliGate_(name, {wires.begin(), wires.end() - 1},
+                                  {wires.back()}, param, adjoint);
     }
     inline void applyRY(const std::vector<std::size_t> &wires, bool adjoint,
                         Precision param) {
         static const std::vector<std::string> name{{"RY"}};
-        applyParametricPauliGate(name, {wires.begin(), wires.end() - 1},
-                                 {wires.back()}, param, adjoint);
+        applyParametricPauliGate_(name, {wires.begin(), wires.end() - 1},
+                                  {wires.back()}, param, adjoint);
     }
     inline void applyRZ(const std::vector<std::size_t> &wires, bool adjoint,
                         Precision param) {
         static const std::vector<std::string> name{{"RZ"}};
-        applyParametricPauliGate(name, {wires.begin(), wires.end() - 1},
-                                 {wires.back()}, param, adjoint);
+        applyParametricPauliGate_(name, {wires.begin(), wires.end() - 1},
+                                  {wires.back()}, param, adjoint);
     }
     inline void applyRot(const std::vector<std::size_t> &wires, bool adjoint,
                          Precision param0, Precision param1, Precision param2) {
@@ -664,17 +622,17 @@ class StateVectorCudaManaged
     inline void applyIsingXX(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         static const std::vector<std::string> names(wires.size(), {"RX"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
     inline void applyIsingYY(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         static const std::vector<std::string> names(wires.size(), {"RY"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
     inline void applyIsingZZ(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         static const std::vector<std::string> names(wires.size(), {"RZ"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
     inline void applyIsingXY(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
@@ -789,7 +747,7 @@ class StateVectorCudaManaged
     inline void applyMultiRZ(const std::vector<std::size_t> &wires,
                              bool adjoint, Precision param) {
         const std::vector<std::string> names(wires.size(), {"RZ"});
-        applyParametricPauliGate(names, {}, wires, param, adjoint);
+        applyParametricPauliGate_(names, {}, wires, param, adjoint);
     }
 
     /* Gate generators */
@@ -1458,30 +1416,44 @@ class StateVectorCudaManaged
     /**
      * @brief Apply parametric Pauli gates using custateVec calls.
      *
-     * @param angle Rotation angle.
      * @param pauli_words List of Pauli words representing operation.
      * @param ctrls Control wires
      * @param tgts target wires.
+     * @param param Rotation angle.
      * @param use_adjoint Take adjoint of operation.
      */
-    void applyParametricPauliGate(const std::vector<std::string> &pauli_words,
-                                  std::vector<std::size_t> ctrls,
-                                  std::vector<std::size_t> tgts,
-                                  Precision param, bool use_adjoint = false) {
-        int nIndexBits = BaseType::getNumQubits();
+    void applyParametricPauliGate_(const std::vector<std::string> &pauli_words,
+                                   std::vector<std::size_t> ctrls,
+                                   std::vector<std::size_t> tgts,
+                                   Precision param, bool use_adjoint = false) {
+        // Transform indices between PL & cuQuantum ordering
+        auto ctrlsInt = NormalizeCastIndices<std::size_t, int>(
+            ctrls, BaseType::getNumQubits());
+        auto tgtsInt = NormalizeCastIndices<std::size_t, int>(
+            tgts, BaseType::getNumQubits());
 
-        std::vector<int> ctrlsInt(ctrls.size());
-        std::vector<int> tgtsInt(tgts.size());
+        const std::vector<int> ctrls_valuesInt(ctrls.size(), 1);
 
-        // Transform indices between PL & cuQuantum ordering
-        std::transform(
-            ctrls.begin(), ctrls.end(), ctrlsInt.begin(), [&](std::size_t x) {
-                return static_cast<int>(BaseType::getNumQubits() - 1 - x);
-            });
-        std::transform(
-            tgts.begin(), tgts.end(), tgtsInt.begin(), [&](std::size_t x) {
-                return static_cast<int>(BaseType::getNumQubits() - 1 - x);
-            });
+        applyParametricPauliGeneralGate_(pauli_words, ctrlsInt, ctrls_valuesInt,
+                                         tgtsInt, param, use_adjoint);
+    }
+
+    /**
+     * @brief Apply a parametric Pauli gate using custateVec calls.
+     *
+     * @param pauli_words List of Pauli words representing operation.
+     * @param ctrlsInt Control wires
+     * @param ctrls_valuesInt Control values
+     * @param tgtsInt target wires.
+     * @param param Rotation angle.
+     * @param use_adjoint Take adjoint of operation.
+     */
+    void applyParametricPauliGeneralGate_(
+        const std::vector<std::string> &pauli_words,
+        const std::vector<int> &ctrlsInt,
+        const std::vector<int> &ctrls_valuesInt, const std::vector<int> tgtsInt,
+        Precision param, bool use_adjoint = false) {
+        int nIndexBits = BaseType::getNumQubits();
 
         cudaDataType_t data_type;
 
@@ -1491,14 +1463,12 @@ class StateVectorCudaManaged
         } else {
             data_type = CUDA_C_32F;
         }
-
         std::vector<custatevecPauli_t> pauli_enums;
         pauli_enums.reserve(pauli_words.size());
         for (const auto &pauli_str : pauli_words) {
             pauli_enums.push_back(native_gates_.at(pauli_str));
         }
         const auto local_angle = (use_adjoint) ? param / 2 : -param / 2;
-
         PL_CUSTATEVEC_IS_SUCCESS(custatevecApplyPauliRotation(
             /* custatevecHandle_t */ handle_.get(),
             /* void* */ BaseType::getData(),
@@ -1507,10 +1477,10 @@ class StateVectorCudaManaged
             /* double */ local_angle,
             /* const custatevecPauli_t* */ pauli_enums.data(),
             /* const int32_t* */ tgtsInt.data(),
-            /* const uint32_t */ tgts.size(),
+            /* const uint32_t */ tgtsInt.size(),
             /* const int32_t* */ ctrlsInt.data(),
-            /* const int32_t* */ nullptr,
-            /* const uint32_t */ ctrls.size()));
+            /* const int32_t* */ ctrls_valuesInt.data(),
+            /* const uint32_t */ ctrlsInt.size()));
         PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
             BaseType::getDataBuffer().getDevTag().getStreamID()));
     }

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -152,6 +152,17 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             "Initialize the statevector data to the |0...0> state")
         .def("collapse", &StateVectorT::collapse,
              "Collapse the statevector onto the 0 or 1 branch of a given wire.")
+        .def(
+            "apply",
+            [](StateVectorT &sv, const std::string &gate_name,
+               const std::vector<std::size_t> &controlled_wires,
+               const std::vector<bool> &controlled_values,
+               const std::vector<std::size_t> &wires, bool inverse,
+               const std::vector<ParamT> &params) {
+                sv.applyOperation(gate_name, controlled_wires,
+                                  controlled_values, wires, inverse, params);
+            },
+            "Apply a named operation with control wires and control values")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,