Changes from all commits (110 commits)
604d704
[https://nvbugs/5542867][fix] Fix the non-determinism issue in the mm…
chang-l Sep 29, 2025
08c8440
add guide for qwen3 next
faradawn Sep 25, 2025
4e9fc75
add disable kv cache reuse
faradawn Sep 28, 2025
ca23956
add benchmarking results
faradawn Sep 29, 2025
f477c15
fix the spelling of TensorRT LLM
faradawn Sep 30, 2025
2e55ed9
change ngc container version to 1.1.0rc7
faradawn Sep 30, 2025
e934e30
add docker build command
faradawn Sep 30, 2025
93574da
add build and run commands
faradawn Sep 30, 2025
c6279e7
add note that more features and optimizations will be released
faradawn Sep 30, 2025
cecf41b
remove mentioning of best performance
faradawn Sep 30, 2025
03a5e40
remove benchmarking result
faradawn Sep 30, 2025
f268afc
add blackwell not supported
faradawn Sep 30, 2025
72de68f
restore unit tests
faradawn Sep 30, 2025
6f53934
add to model feature support matrix
faradawn Sep 30, 2025
a115dcf
change parameters to match CI test in #8111
faradawn Oct 1, 2025
ee3e064
[https://nvbugs/5538098][fix] Checking connection to etcd server in u…
pcastonguay Sep 30, 2025
854cf48
[TRTLLM-6741][fix] Add heuristics for lm head tp size when `enable_lm…
Njuapp Sep 30, 2025
31484e0
[None][feat] Return topk logprobs in torch backend (#7976)
dcaox Sep 30, 2025
2c243ad
[#4593][feat] AutoDeploy: Linear Attention Support (SSM + causal_conv…
lucaslie Sep 30, 2025
2474314
[None] [test] Add MNNVL AlltoAll tests to pre-merge (#7466)
kaiyux Sep 30, 2025
8c31bf8
[TRTLLM-6239][feat] add test cases into QA test list (#8081)
xinhe-nv Sep 30, 2025
8d5be98
[None][fix] Fix CUDA graph for Qwen2.5-VL (#8047)
yechank-nvidia Sep 30, 2025
3b5c92b
[None][chore] Bump version to 1.2.0rc1 (#8097)
yiqingy0 Sep 30, 2025
a34c223
[https://nvbugs/5547414][fix] avoid downloading Tiny llama from HF (#…
Tabrizian Sep 30, 2025
b0350c5
[None][chore] Refine qwen3-next implementation. (#8064)
nv-guomingz Sep 30, 2025
1b3c948
[TRTLLM-8269][fix] Revert "do not explicitly pass temperature=0 to se…
ixlmar Sep 30, 2025
46b931c
[None][chore] Waive failing MNNVL alltoall multi-gpu test (#8106)
brb-nv Oct 1, 2025
6b74bc5
[None][fix] : Fix OOM issue when dp padding is enabled (#8052)
peaceh-nv Oct 1, 2025
326ff73
[None][doc] Add more description on EXAONE usage (#8089)
yechank-nvidia Oct 1, 2025
b86628b
[None][infra] Skip failed tests in post-merge for main (#8102)
EmmaQiaoCh Oct 1, 2025
be32c5b
[https://nvbugs/5434320][fix] fix: Unwaiving disagg pp tests (#8069)
pcastonguay Oct 1, 2025
1cd6938
[OMNIML-2336][feat] add W4A8 NVFP4 FP8 fused moe (#7968)
sychen52 Oct 1, 2025
a4b702f
[TRTLLM-6342][bug] Fix shape propagation after TP sharding (#7912)
greg-kwasniewski1 Oct 1, 2025
f24df4f
[TRTLLM-8031][feat] Add chunked return_generation_logits logic (#7831)
yibinl-nvidia Oct 1, 2025
79b53d5
[#5860][feat] Add ModelOPT INT4 awq fake quant support in AutoDeploy …
Fridah-nv Oct 1, 2025
da2f7c1
[None][fix] fix patchelf version issue (#8112)
bo-nv Oct 1, 2025
8a9dc7d
[None][feat] Draft: Save state first pass (#7012)
IzzyPutterman Oct 1, 2025
e4b0fe7
[TRTLLM-7733][feat] Executor changes to support helix parallelism (#7…
brb-nv Oct 2, 2025
947d6b9
[https://nvbugs/5549081][fix] Fix device id assignment for some visio…
chang-l Oct 2, 2025
ed85512
[#7588][feat] lock gpu clocks in test_perf.py to reliably detect perf…
MrGeva Oct 2, 2025
de61541
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
604b708
add benchmarking results
faradawn Sep 29, 2025
7681dbc
remove benchmarking result
faradawn Sep 30, 2025
3a1628c
Revert "restore unit tests"
Funatiq Oct 2, 2025
2be58ea
[None][test] Add accuracy test for Qwen3Next model
Funatiq Oct 1, 2025
8e9e8d5
[None] [test] Add MNNVL AlltoAll tests to pre-merge (#7466)
kaiyux Sep 30, 2025
b831f18
[TRTLLM-8269][fix] Revert "do not explicitly pass temperature=0 to se…
ixlmar Sep 30, 2025
634e739
[None][chore] Waive failing MNNVL alltoall multi-gpu test (#8106)
brb-nv Oct 1, 2025
c70a806
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
3b3f280
[https://nvbugs/5556020][chore] waive test_eagle3 (#8119)
hchings Oct 2, 2025
1349e2c
[TRTLLM-6589][feat] Support CUDA graph for DeepEP (#7514)
yifeizhang-c Oct 2, 2025
acd6dbf
[TRTLLM-7775][feat] Integrate tinygemm2 for gpt-oss (#7916)
dongfengy Oct 2, 2025
a3d83e8
[None][feat] Support for cancelling requests with disaggregation (#8114)
pcastonguay Oct 2, 2025
3247ff0
[None][fix] Fix access to new tokens in sampler. (#7958)
dcampora Oct 2, 2025
a444e00
[None][chore] Adding install_tensorrt.sh script to pip wheel (#8116)
pcastonguay Oct 2, 2025
b0d9fa8
[#7588][fix] fixed the kv cache size parsing in test_perf.py AD backe…
MrGeva Oct 2, 2025
063e969
[TRTLLM-6342][bug] Patched incorrect starcoder tp config (#8118)
greg-kwasniewski1 Oct 2, 2025
1911489
[None][feat] perf_metrics endpoint functionality improvement (#8005)
nv-yilinf Oct 3, 2025
381e08f
[None][feat] Update TRT-LLM Gen MoE kernels (#7970)
nekorobov Oct 3, 2025
5b28fa2
[https://nvbugs/5548098][fix] Fix flakey unit test for dynamic spec d…
hchings Oct 3, 2025
dddffd3
restore unit tests
faradawn Sep 30, 2025
8ad0381
[None] [test] Add MNNVL AlltoAll tests to pre-merge (#7466)
kaiyux Sep 30, 2025
3b4f5a9
[TRTLLM-8269][fix] Revert "do not explicitly pass temperature=0 to se…
ixlmar Sep 30, 2025
a4ddcc0
[None][chore] Waive failing MNNVL alltoall multi-gpu test (#8106)
brb-nv Oct 1, 2025
76ef41a
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
d9356c8
add benchmarking results
faradawn Sep 29, 2025
50eda7a
remove benchmarking result
faradawn Sep 30, 2025
e257e6d
Revert "restore unit tests"
Funatiq Oct 2, 2025
ca82911
[None][fix] Fix MTP 2-model (#8115)
mikeiovine Oct 3, 2025
38da871
[TRTLLM-6496][feat] Add LoRa Torch tests for the latest NIM model lis…
moraxu Oct 3, 2025
2c454e8
[None][feat] AutoDeploy: Nemotron-H accuracy test (#8133)
lucaslie Oct 3, 2025
9d098e3
[None][feat] AutoDeploy: graph/module inputs with kwargs instead of a…
lucaslie Oct 3, 2025
88ea2c4
[TRTLLM-7349][feat] Adding new orchestrator type -- ray (#7520)
joyang-nv Oct 4, 2025
744246d
[None][autodeploy] small refactors on attention matching (#8079)
Fridah-nv Oct 4, 2025
f6654f2
[#5255][autodeploy] Update FuseAllreduceResidualRMSNorm to use patter…
Fridah-nv Oct 5, 2025
fb51de6
[TRTLLM-8189][chore] enhance GenerationExecutor with RPC (part1) (#5543)
Superjomn Oct 5, 2025
8060aad
[https://nvbugs/5521949][fix] Re-enable test_bielik_11b_v2_2_instruct…
amitz-nv Oct 5, 2025
fba351a
[None][fix] Adding docker folder to Dockerfile (#8138)
pcastonguay Oct 5, 2025
54ab976
[None][chore] fix llmargs conflict (#8152)
Superjomn Oct 6, 2025
98b3af4
[TRTLLM-8413][chore] resolve sampling defaults in OpenAI API backend …
ixlmar Oct 6, 2025
3492391
[None][chore] AutoDeploy: clean up accuracy test configs (#8134)
lucaslie Oct 6, 2025
f2657c1
[None][fix] Eagle: Attention DP (#7939)
IzzyPutterman Oct 6, 2025
27a5091
[None][feat] GPT-OSS Sm120/Sm121 Support (#7937)
farazkh80 Oct 6, 2025
2b8722b
[None][chore] Increase operations-per-run to 1000 for stale action (#…
karljang Oct 6, 2025
9298f1b
[None] [test] Add B300 cases to CI (#8056)
VALLIS-NERIA Oct 7, 2025
ca9da1f
[None][infra] Skip failed cases for main (#8176)
EmmaQiaoCh Oct 7, 2025
7facac0
[None][fix] Fix MTP illegal memory access (#8161)
mikeiovine Oct 7, 2025
0017969
add guide for qwen3 next
faradawn Sep 25, 2025
38317e8
add disable kv cache reuse
faradawn Sep 28, 2025
edfa754
add benchmarking results
faradawn Sep 29, 2025
4b596d7
fix the spelling of TensorRT LLM
faradawn Sep 30, 2025
398cda4
change ngc container version to 1.1.0rc7
faradawn Sep 30, 2025
4d11895
add docker build command
faradawn Sep 30, 2025
91a313c
add build and run commands
faradawn Sep 30, 2025
3683050
add note that more features and optimizations will be released
faradawn Sep 30, 2025
67d8548
remove mentioning of best performance
faradawn Sep 30, 2025
e53bd19
remove benchmarking result
faradawn Sep 30, 2025
0177e70
add blackwell not supported
faradawn Sep 30, 2025
4317b5b
restore unit tests
faradawn Sep 30, 2025
5d66cac
add to model feature support matrix
faradawn Sep 30, 2025
3fa9583
change parameters to match CI test in #8111
faradawn Oct 1, 2025
2a99df8
add benchmarking results
faradawn Sep 29, 2025
5887d91
remove benchmarking result
faradawn Sep 30, 2025
9d77fe4
Revert "restore unit tests"
Funatiq Oct 2, 2025
aa1b6fb
[None][test] Add accuracy test for Qwen3Next model
Funatiq Oct 1, 2025
ebf738b
restore unit tests
faradawn Sep 30, 2025
730422e
add benchmarking results
faradawn Sep 29, 2025
d4542c9
remove benchmarking result
faradawn Sep 30, 2025
16055ee
Revert "restore unit tests"
Funatiq Oct 2, 2025
811c477
Merge branch qwen3-next-guide of github.com:faradawn/TensorRT-LLM int…
faradawn Oct 7, 2025
1 change: 1 addition & 0 deletions .github/workflows/auto-close-inactive-issues.yml
@@ -28,3 +28,4 @@ jobs:
labels-to-remove-when-unstale: 'stale,waiting for feedback'
stale-issue-label: 'stale'
stale-pr-label: 'stale'
operations-per-run: 1000
1 change: 1 addition & 0 deletions .gitignore
@@ -46,6 +46,7 @@ tensorrt_llm/deep_ep_cpp_tllm.pyi
tensorrt_llm/deep_gemm/
tensorrt_llm/deep_gemm_cpp_tllm.*.so
tensorrt_llm/deep_gemm_cpp_tllm.pyi
tensorrt_llm/pg_utils_bindings.*.so
*docs/cpp_docs*
*docs/source/_cpp_gen*
docs/source/**/*.rst
146 changes: 142 additions & 4 deletions cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
@@ -23,9 +23,17 @@
#include "tensorrt_llm/executor/cacheCommunicator.h"
#include "tensorrt_llm/executor/dataTransceiverState.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/utils/pgUtils.h"
#include <future>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <pybind11/pybind11.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <torch/custom_class.h>
#include <torch/python.h>
#include <type_traits>
#include <vector>

using SizeType32 = tensorrt_llm::runtime::SizeType32;

@@ -43,6 +51,134 @@ class BaseKVCacheManager;
class CacheSender;
class CacheReceiver;

class CacheTransceiverComm
{
public:
// Construct from a non-owning raw pointer; ownership stays with the caller (aliasing shared_ptr with an empty owner)
explicit CacheTransceiverComm(mpi::MpiComm const* mpiComm)
: mMpiComm(std::shared_ptr<mpi::MpiComm const>(nullptr), mpiComm)
{
}

// Construct from a shared_ptr with shared ownership
explicit CacheTransceiverComm(std::shared_ptr<mpi::MpiComm const> mpiComm)
: mMpiComm(std::move(mpiComm))
{
}

// Construct from a ProcessGroup communicator
explicit CacheTransceiverComm(c10::intrusive_ptr<c10d::ProcessGroup> pgComm)
: mPgComm(std::move(pgComm))
{
}

~CacheTransceiverComm() = default;

bool isMpi() const noexcept
{
return mMpiComm != nullptr;
}

int getRank() const
{
if (isMpi())
{
return mMpiComm->getRank();
}
return mPgComm->getRank();
}

int getSize() const
{
if (isMpi())
{
return mMpiComm->getSize();
}
return mPgComm->getSize();
}

void allgather(void const* sendbuf, void* recvbuf, int count, mpi::MpiType dtype) const
{
if (isMpi())
{
mMpiComm->allgather(sendbuf, recvbuf, count, dtype);
return;
}
TLLM_THROW("Input arguments only supported in mpi");
}

template <typename Input, typename Output>
bool allgather(Input input, Output output, c10d::AllgatherOptions options = c10d::AllgatherOptions()) const
{
if (isMpi())
{
TLLM_THROW("Input arguments only supported in pg");
}
tensorrt_llm::pg_utils::PgHelper pgh{mPgComm};

PGCHECK_THROW(pgh.allgather(input, output, options));
return true;
}

template <typename Input, typename Output>
bool allgatherv(Input input, Output output, std::vector<int> const& sizes,
c10d::AllgatherOptions options = c10d::AllgatherOptions()) const
{
if (isMpi())
{
TLLM_THROW("Input arguments only supported in pg");
}
tensorrt_llm::pg_utils::PgHelper pgh{mPgComm};
PGCHECK_THROW(pgh.allgatherv(input, output, sizes, options));
return true;
}

bool allgatherv(void const* sendbuf, int sendcount, mpi::MpiType sendtype, void* recvbuf,
std::vector<int> const& recvcounts, std::vector<int> const& displs, mpi::MpiType recvtype) const
{
if (isMpi())
{
mMpiComm->allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype);
return true;
}
TLLM_THROW("Input arguments only supported in mpi");
}

CacheTransceiverComm split(int color, int key)
{
if (isMpi())
{
auto subgroup = mMpiComm->split(color, key);
return CacheTransceiverComm(std::make_shared<mpi::MpiComm const>(std::move(subgroup)));
}
bool const initialized = Py_IsInitialized();
TLLM_CHECK_WITH_INFO(initialized, "Trying to use ProcessGroup communicator but Python is not initialized");
try
{
c10::intrusive_ptr<c10d::ProcessGroup> pgSub;
{
pybind11::gil_scoped_acquire gil;
auto const m = pybind11::module::import("tensorrt_llm._torch.distributed.pg_utils");
// Properly box the existing intrusive_ptr ProcessGroup into an IValue
// and convert to a Python object without constructing a new instance.
auto const py_pg = torch::jit::toPyObject(c10::IValue(mPgComm));

auto const py_sub_pg = m.attr("split")(color, key, py_pg);
pgSub = torch::jit::toCustomClass<c10d::ProcessGroup>(py_sub_pg);
}
return CacheTransceiverComm(pgSub);
}
catch (...)
{
TLLM_THROW("Failed to split process group");
}
}

private:
std::shared_ptr<mpi::MpiComm const> mMpiComm;
c10::intrusive_ptr<c10d::ProcessGroup> mPgComm;
};

class CacheTransceiverFactory
{
public:
@@ -124,9 +260,11 @@ class CacheTransceiver : public BaseCacheTransceiver
std::unique_ptr<CacheReceiver> mCacheReceiver;
std::vector<std::pair<LlmRequest*, std::future<void>>> mSenderFutures;
std::vector<std::pair<LlmRequest*, std::future<void>>> mRequesterFutures;
mpi::MpiComm const *mMpiGroupComm{nullptr}, *mMpiWorldComm{nullptr};
std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm, mMpiGroupDataComm,
mMpiGroupTPInDPComm;
mpi::MpiComm const* mMpiWorldComm{nullptr};

std::shared_ptr<CacheTransceiverComm> mGroupComm;
std::shared_ptr<CacheTransceiverComm> mGroupTensorParaComm, mGroupPipeParaComm, mGroupDataComm, mGroupTPInDPComm;

executor::kv_cache::CommState const* mCommState;
std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
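The new CacheTransceiverComm abstracts over two transport backends, a classic MPI communicator and a torch c10d::ProcessGroup, behind one rank/size/allgather/split surface so the cache transceiver no longer hard-codes MPI. Below is a minimal usage sketch, assuming the class lives in the tensorrt_llm::batch_manager namespace and that the process was launched under MPI; the function name and rank grouping are illustrative, not part of this PR.

// Usage sketch (illustrative; not part of the diff).
#include "tensorrt_llm/batch_manager/cacheTransceiver.h"

void sketchSubgroup(tensorrt_llm::mpi::MpiComm const& groupComm)
{
    using tensorrt_llm::batch_manager::CacheTransceiverComm; // assumed namespace

    // Non-owning wrap: the caller keeps ownership of groupComm.
    CacheTransceiverComm comm(&groupComm);

    int const rank = comm.getRank();

    // Pair up ranks: {0,1} -> color 0, {2,3} -> color 1, ...
    // With the MPI backend this forwards to MpiComm::split; with a
    // ProcessGroup backend it round-trips through the Python pg_utils helper.
    auto pairComm = comm.split(/*color=*/rank / 2, /*key=*/rank % 2);
    (void) pairComm;
}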
72 changes: 72 additions & 0 deletions cpp/include/tensorrt_llm/common/bindingUtils.h
@@ -0,0 +1,72 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "c10/util/intrusive_ptr.h"
#include <Python.h>

namespace tensorrt_llm::common
{

// Adapted from pybind11's example implementation:
// https://github.com/pybind/pybind11/blob/master/include/pybind11/conduit/pybind11_conduit_v1.h
// Copyright (c) 2024 The pybind Community.

inline void* get_raw_pointer_ephemeral(
PyObject* py_obj, std::type_info const* cpp_type_info, std::string const& pybind11_abi)
{
PyObject* cpp_type_info_capsule = PyCapsule_New(
const_cast<void*>(static_cast<void const*>(cpp_type_info)), typeid(std::type_info).name(), nullptr);
if (cpp_type_info_capsule == nullptr)
{
return nullptr;
}
PyObject* cpp_conduit = PyObject_CallMethod(
py_obj, "_pybind11_conduit_v1_", "yOy", pybind11_abi.c_str(), cpp_type_info_capsule, "raw_pointer_ephemeral");
Py_DECREF(cpp_type_info_capsule);
if (cpp_conduit == nullptr)
{
return nullptr;
}
void* raw_ptr = PyCapsule_GetPointer(cpp_conduit, cpp_type_info->name());
Py_DECREF(cpp_conduit);
if (PyErr_Occurred())
{
return nullptr;
}
return raw_ptr;
}

template <typename T, typename E>
T* get_type_pointer_ephemeral(PyObject* py_obj, std::string pybind11_abi)
{
void* raw_ptr = get_raw_pointer_ephemeral(py_obj, &typeid(T), pybind11_abi);
if (raw_ptr == nullptr)
{
throw E();
}
return static_cast<T*>(raw_ptr);
}

template <typename T, typename E>
c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_abi)
{
auto* const p = get_type_pointer_ephemeral<T, E>(py_obj, pybind11_abi);
return c10::intrusive_ptr<T>::reclaim_copy(p);
}

} // namespace tensorrt_llm::common
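These helpers use pybind11's conduit protocol to recover a raw C++ pointer from a Python-wrapped object even across extension-module ABI boundaries; get_intrusive_ptr then turns that ephemeral pointer into a co-owning handle via reclaim_copy, which adds a reference. A hedged sketch of how a c10d::ProcessGroup might be recovered follows; the exception type and caller are assumptions for illustration.

// Illustrative caller (not part of the diff). ConduitError and
// pgFromPython are assumptions for this sketch.
#include "tensorrt_llm/common/bindingUtils.h"
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <stdexcept>
#include <string>

struct ConduitError : std::runtime_error
{
    ConduitError()
        : std::runtime_error("pybind11 conduit lookup failed")
    {
    }
};

c10::intrusive_ptr<c10d::ProcessGroup> pgFromPython(PyObject* pyPg, std::string const& pybind11Abi)
{
    // get_intrusive_ptr calls reclaim_copy, so the returned handle holds its
    // own reference and safely outlives the Python object that produced it.
    return tensorrt_llm::common::get_intrusive_ptr<c10d::ProcessGroup, ConduitError>(pyPg, pybind11Abi);
}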
8 changes: 6 additions & 2 deletions cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -295,7 +295,11 @@
};
#endif

inline int getSMVersion()
/// @brief Get the SM version of the current device.
/// @param queryRealSmArch Whether to query the real SM architecture. For example, query the real architecture when
/// doing LUT tuning, and keep the aliased architecture when reusing SM 120 code on SM 121 devices.
/// @return The SM version of the current device.
inline int getSMVersion(bool queryRealSmArch = false)
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
@@ -304,7 +308,7 @@ inline int getSMVersion()
check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
int sm = sm_major * 10 + sm_minor;
if (sm == 121)
if (sm == 121 && !queryRealSmArch)
{
return 120;
}
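With this change, getSMVersion() keeps reporting SM 121 devices as SM 120 so existing SM 120 code paths are reused, while getSMVersion(true) exposes the physical architecture for cases such as LUT tuning. A small sketch of the two query modes; the logging is illustrative.

// Illustrative only: aliased vs. real query on an SM 121 device.
#include "tensorrt_llm/common/cudaUtils.h"
#include <cstdio>

void reportSmArch()
{
    int const aliasedSm = tensorrt_llm::common::getSMVersion();  // SM 121 reported as 120
    int const realSm = tensorrt_llm::common::getSMVersion(true); // physical arch, e.g. 121
    if (aliasedSm != realSm)
    {
        std::printf("Reusing SM %d kernels on SM %d hardware\n", aliasedSm, realSm);
    }
}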
23 changes: 23 additions & 0 deletions cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h
@@ -35,6 +35,7 @@
#include <cstdlib>
#include <memory>
#include <mutex>
#include <optional>
#include <thread>

#if ENABLE_MULTI_DEVICE
Expand Down Expand Up @@ -425,7 +426,29 @@ class MpiComm
return !(rhs == *this);
}

bool couldUseMPI() const
{
if (!mDisableMPI.has_value())
{
char const* val = std::getenv("TLLM_DISABLE_MPI");
mDisableMPI = (val != nullptr && std::string(val) == "1");
}
if (mDisableMPI.value())
{
throw std::runtime_error("MPI is disabled (TLLM_DISABLE_MPI=1); MPI must not be used");
}
return true;
}

private:
mutable std::optional<bool> mDisableMPI;
//! \brief Corresponds to `world()` by default, but can be overridden per process.
static MpiComm& mutableSession();

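couldUseMPI() lazily caches the TLLM_DISABLE_MPI environment variable and then throws on any attempted MPI use when it is set to 1. A hedged sketch of a guarded call site, assuming the MpiComm::session() accessor from the existing header; the fallback value is illustrative.

// Illustrative guard (not part of the diff): fall back to rank 0 when MPI
// has been disabled via TLLM_DISABLE_MPI=1.
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include <stdexcept>

int worldRankOrZero()
{
    auto const& comm = tensorrt_llm::mpi::MpiComm::session();
    try
    {
        comm.couldUseMPI(); // throws std::runtime_error when MPI is disabled
        return comm.getRank();
    }
    catch (std::runtime_error const&)
    {
        return 0; // single-process fallback
    }
}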