Changes from all commits (110 commits)
604d704
[https://nvbugs/5542867][fix] Fix the non-determinism issue in the mm…
chang-l Sep 29, 2025
08c8440
add guide for qwen3 next
faradawn Sep 25, 2025
4e9fc75
add disable kv cache reuse
faradawn Sep 28, 2025
ca23956
add benchmarking results
faradawn Sep 29, 2025
f477c15
fix the spelling of TensorRT LLM
faradawn Sep 30, 2025
2e55ed9
change ngc container version to 1.1.0rc7
faradawn Sep 30, 2025
e934e30
add docker build command
faradawn Sep 30, 2025
93574da
add build and run commands
faradawn Sep 30, 2025
c6279e7
add note that more features and optimizations will be released
faradawn Sep 30, 2025
cecf41b
remove mentioning of best performance
faradawn Sep 30, 2025
03a5e40
remove benchmarking result
faradawn Sep 30, 2025
f268afc
add blackwell not supported
faradawn Sep 30, 2025
72de68f
restore unit tests
faradawn Sep 30, 2025
6f53934
add to model feature support matrix
faradawn Sep 30, 2025
a115dcf
change parameters to match CI test in #8111
faradawn Oct 1, 2025
ee3e064
[https://nvbugs/5538098][fix] Checking connection to etcd server in u…
pcastonguay Sep 30, 2025
854cf48
[TRTLLM-6741][fix] Add heuristics for lm head tp size when `enable_lm…
Njuapp Sep 30, 2025
31484e0
[None][feat] Return topk logprobs in torch backend (#7976)
dcaox Sep 30, 2025
2c243ad
[#4593][feat] AutoDeploy: Linear Attention Support (SSM + causal_conv…
lucaslie Sep 30, 2025
2474314
[None] [test] Add MNNVL AlltoAll tests to pre-merge (#7466)
kaiyux Sep 30, 2025
8c31bf8
[TRTLLM-6239][feat] add test cases into QA test list (#8081)
xinhe-nv Sep 30, 2025
8d5be98
[None][fix] Fix CUDA graph for Qwen2.5-VL (#8047)
yechank-nvidia Sep 30, 2025
3b5c92b
[None][chore] Bump version to 1.2.0rc1 (#8097)
yiqingy0 Sep 30, 2025
a34c223
[https://nvbugs/5547414][fix] avoid downloading Tiny llama from HF (#…
Tabrizian Sep 30, 2025
b0350c5
[None][chore] Refine qwen3-next implementation. (#8064)
nv-guomingz Sep 30, 2025
1b3c948
[TRTLLM-8269][fix] Revert "do not explicitly pass temperature=0 to se…
ixlmar Sep 30, 2025
46b931c
[None][chore] Waive failing MNNVL alltoall multi-gpu test (#8106)
brb-nv Oct 1, 2025
6b74bc5
[None][fix] : Fix OOM issue when dp padding is enabled (#8052)
peaceh-nv Oct 1, 2025
326ff73
[None][doc] Add more description on EXAONE usage (#8089)
yechank-nvidia Oct 1, 2025
b86628b
[None][infra] Skip failed tests in post-merge for main (#8102)
EmmaQiaoCh Oct 1, 2025
be32c5b
[https://nvbugs/5434320][fix] fix: Unwaiving disagg pp tests (#8069)
pcastonguay Oct 1, 2025
1cd6938
[OMNIML-2336][feat] add W4A8 NVFP4 FP8 fused moe (#7968)
sychen52 Oct 1, 2025
a4b702f
[TRTLLM-6342][bug] Fix shape propagation after TP sharding (#7912)
greg-kwasniewski1 Oct 1, 2025
f24df4f
[TRTLLM-8031][feat] Add chunked return_generation_logits logic (#7831)
yibinl-nvidia Oct 1, 2025
79b53d5
[#5860][feat] Add ModelOPT INT4 awq fake quant support in AutoDeploy …
Fridah-nv Oct 1, 2025
da2f7c1
[None][fix] fix patchelf version issue (#8112)
bo-nv Oct 1, 2025
8a9dc7d
[None][feat] Draft: Save state first pass (#7012)
IzzyPutterman Oct 1, 2025
e4b0fe7
[TRTLLM-7733][feat] Executor changes to support helix parallelism (#7…
brb-nv Oct 2, 2025
947d6b9
[https://nvbugs/5549081][fix] Fix device id assignment for some visio…
chang-l Oct 2, 2025
ed85512
[#7588][feat] lock gpu clocks in test_perf.py to reliably detect perf…
MrGeva Oct 2, 2025
de61541
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
604b708
add benchmarking results
faradawn Sep 29, 2025
7681dbc
remove benchmarking result
faradawn Sep 30, 2025
3a1628c
Revert "restore unit tests"
Funatiq Oct 2, 2025
2be58ea
[None][test] Add accuracy test for Qwen3Next model
Funatiq Oct 1, 2025
8e9e8d5
[None] [test] Add MNNVL AlltoAll tests to pre-merge (#7466)
kaiyux Sep 30, 2025
b831f18
[TRTLLM-8269][fix] Revert "do not explicitly pass temperature=0 to se…
ixlmar Sep 30, 2025
634e739
[None][chore] Waive failing MNNVL alltoall multi-gpu test (#8106)
brb-nv Oct 1, 2025
c70a806
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
3b3f280
[https://nvbugs/5556020][chore] waive test_eagle3 (#8119)
hchings Oct 2, 2025
1349e2c
[TRTLLM-6589][feat] Support CUDA graph for DeepEP (#7514)
yifeizhang-c Oct 2, 2025
acd6dbf
[TRTLLM-7775][feat] Integrate tinygemm2 for gpt-oss (#7916)
dongfengy Oct 2, 2025
a3d83e8
[None][feat] Support for cancelling requests with disaggregation (#8114)
pcastonguay Oct 2, 2025
3247ff0
[None][fix] Fix access to new tokens in sampler. (#7958)
dcampora Oct 2, 2025
a444e00
[None][chore] Adding install_tensorrt.sh script to pip wheel (#8116)
pcastonguay Oct 2, 2025
b0d9fa8
[#7588][fix] fixed the kv cache size parsing in test_perf.py AD backe…
MrGeva Oct 2, 2025
063e969
[TRTLLM-6342][bug] Patched incorrect starcoder tp config (#8118)
greg-kwasniewski1 Oct 2, 2025
1911489
[None][feat] perf_metrics endpoint functionality improvement (#8005)
nv-yilinf Oct 3, 2025
381e08f
[None][feat] Update TRT-LLM Gen MoE kernels (#7970)
nekorobov Oct 3, 2025
5b28fa2
[https://nvbugs/5548098][fix] Fix flakey unit test for dynamic spec d…
hchings Oct 3, 2025
dddffd3
restore unit tests
faradawn Sep 30, 2025
8ad0381
[None] [test] Add MNNVL AlltoAll tests to pre-merge (#7466)
kaiyux Sep 30, 2025
3b4f5a9
[TRTLLM-8269][fix] Revert "do not explicitly pass temperature=0 to se…
ixlmar Sep 30, 2025
a4ddcc0
[None][chore] Waive failing MNNVL alltoall multi-gpu test (#8106)
brb-nv Oct 1, 2025
76ef41a
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
d9356c8
add benchmarking results
faradawn Sep 29, 2025
50eda7a
remove benchmarking result
faradawn Sep 30, 2025
e257e6d
Revert "restore unit tests"
Funatiq Oct 2, 2025
ca82911
[None][fix] Fix MTP 2-model (#8115)
mikeiovine Oct 3, 2025
38da871
[TRTLLM-6496][feat] Add LoRa Torch tests for the latest NIM model lis…
moraxu Oct 3, 2025
2c454e8
[None][feat] AutoDeploy: Nemotron-H accuracy test (#8133)
lucaslie Oct 3, 2025
9d098e3
[None][feat] AutoDeploy: graph/module inputs with kwargs instead of a…
lucaslie Oct 3, 2025
88ea2c4
[TRTLLM-7349][feat] Adding new orchestrator type -- ray (#7520)
joyang-nv Oct 4, 2025
744246d
[None][autodeploy] small refactors on attention matching (#8079)
Fridah-nv Oct 4, 2025
f6654f2
[#5255][autodeploy] Update FuseAllreduceResidualRMSNorm to use patter…
Fridah-nv Oct 5, 2025
fb51de6
[TRTLLM-8189][chore] enhance GenerationExecutor with RPC (part1) (#5543)
Superjomn Oct 5, 2025
8060aad
[https://nvbugs/5521949][fix] Re-enable test_bielik_11b_v2_2_instruct…
amitz-nv Oct 5, 2025
fba351a
[None][fix] Adding docker folder to Dockerfile (#8138)
pcastonguay Oct 5, 2025
54ab976
[None][chore] fix llmargs conflict (#8152)
Superjomn Oct 6, 2025
98b3af4
[TRTLLM-8413][chore] resolve sampling defaults in OpenAI API backend …
ixlmar Oct 6, 2025
3492391
[None][chore] AutoDeploy: clean up accuracy test configs (#8134)
lucaslie Oct 6, 2025
f2657c1
[None][fix] Eagle: Attention DP (#7939)
IzzyPutterman Oct 6, 2025
27a5091
[None][feat] GPT-OSS Sm120/Sm121 Support (#7937)
farazkh80 Oct 6, 2025
2b8722b
[None][chore] Increase operations-per-run to 1000 for stale action (#…
karljang Oct 6, 2025
9298f1b
[None] [test] Add B300 cases to CI (#8056)
VALLIS-NERIA Oct 7, 2025
ca9da1f
[None][infra] Skip failed cases for main (#8176)
EmmaQiaoCh Oct 7, 2025
7facac0
[None][fix] Fix MTP illegal memory access (#8161)
mikeiovine Oct 7, 2025
0017969
add guide for qwen3 next
faradawn Sep 25, 2025
38317e8
add disable kv cache reuse
faradawn Sep 28, 2025
edfa754
add benchmarking results
faradawn Sep 29, 2025
4b596d7
fix the spelling of TensorRT LLM
faradawn Sep 30, 2025
398cda4
change ngc container version to 1.1.0rc7
faradawn Sep 30, 2025
4d11895
add docker build command
faradawn Sep 30, 2025
91a313c
add build and run commands
faradawn Sep 30, 2025
3683050
add note that more features and optimizations will be released
faradawn Sep 30, 2025
67d8548
remove mentioning of best performance
faradawn Sep 30, 2025
e53bd19
remove benchmarking result
faradawn Sep 30, 2025
0177e70
add blackwell not supported
faradawn Sep 30, 2025
4317b5b
restore unit tests
faradawn Sep 30, 2025
5d66cac
add to model feature support matrix
faradawn Sep 30, 2025
3fa9583
change parameters to match CI test in #8111
faradawn Oct 1, 2025
2a99df8
add benchmarking results
faradawn Sep 29, 2025
5887d91
remove benchmarking result
faradawn Sep 30, 2025
9d77fe4
Revert "restore unit tests"
Funatiq Oct 2, 2025
aa1b6fb
[None][test] Add accuracy test for Qwen3Next model
Funatiq Oct 1, 2025
ebf738b
restore unit tests
faradawn Sep 30, 2025
730422e
add benchmarking results
faradawn Sep 29, 2025
d4542c9
remove benchmarking result
faradawn Sep 30, 2025
16055ee
Revert "restore unit tests"
Funatiq Oct 2, 2025
811c477
Merge branch qwen3-next-guide of github.com:faradawn/TensorRT-LLM int…
faradawn Oct 7, 2025
1 change: 1 addition & 0 deletions .github/workflows/auto-close-inactive-issues.yml
@@ -28,3 +28,4 @@ jobs:
labels-to-remove-when-unstale: 'stale,waiting for feedback'
stale-issue-label: 'stale'
stale-pr-label: 'stale'
operations-per-run: 1000
1 change: 1 addition & 0 deletions .gitignore
@@ -46,6 +46,7 @@ tensorrt_llm/deep_ep_cpp_tllm.pyi
tensorrt_llm/deep_gemm/
tensorrt_llm/deep_gemm_cpp_tllm.*.so
tensorrt_llm/deep_gemm_cpp_tllm.pyi
tensorrt_llm/pg_utils_bindings.*.so
*docs/cpp_docs*
*docs/source/_cpp_gen*
docs/source/**/*.rst
146 changes: 142 additions & 4 deletions cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
@@ -23,9 +23,17 @@
#include "tensorrt_llm/executor/cacheCommunicator.h"
#include "tensorrt_llm/executor/dataTransceiverState.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/utils/pgUtils.h"
#include <future>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <pybind11/pybind11.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <torch/custom_class.h>
#include <torch/python.h>
#include <type_traits>
#include <vector>

using SizeType32 = tensorrt_llm::runtime::SizeType32;

@@ -43,6 +51,134 @@ class BaseKVCacheManager;
class CacheSender;
class CacheReceiver;

class CacheTransceiverComm
{
public:
// Construct from a non-owning raw pointer; ownership stays with the caller (aliasing shared_ptr with an empty owner)
explicit CacheTransceiverComm(mpi::MpiComm const* mpiComm)
: mMpiComm(std::shared_ptr<mpi::MpiComm const>(nullptr), mpiComm)
{
}

// Construct from a shared_ptr with shared ownership
explicit CacheTransceiverComm(std::shared_ptr<mpi::MpiComm const> mpiComm)
: mMpiComm(std::move(mpiComm))
{
}

// Construct from a ProcessGroup communicator
explicit CacheTransceiverComm(c10::intrusive_ptr<c10d::ProcessGroup> pgComm)
: mPgComm(std::move(pgComm))
{
}

~CacheTransceiverComm() = default;

bool isMpi() const noexcept
{
return mMpiComm != nullptr;
}

int getRank() const
{
if (isMpi())
{
return mMpiComm->getRank();
}
return mPgComm->getRank();
}

int getSize() const
{
if (isMpi())
{
return mMpiComm->getSize();
}
return mPgComm->getSize();
}

void allgather(void const* sendbuf, void* recvbuf, int count, mpi::MpiType dtype) const
{
if (isMpi())
{
mMpiComm->allgather(sendbuf, recvbuf, count, dtype);
return;
}
TLLM_THROW("Input arguments only supported in mpi");
}

template <typename Input, typename Output>
bool allgather(Input input, Output output, c10d::AllgatherOptions options = c10d::AllgatherOptions()) const
{
if (isMpi())
{
TLLM_THROW("Input arguments only supported in pg");
}
tensorrt_llm::pg_utils::PgHelper pgh{mPgComm};

PGCHECK_THROW(pgh.allgather(input, output, options));
return true;
}

template <typename Input, typename Output>
bool allgatherv(Input input, Output output, std::vector<int> const& sizes,
c10d::AllgatherOptions options = c10d::AllgatherOptions()) const
{
if (isMpi())
{
TLLM_THROW("Input arguments only supported in pg");
}
tensorrt_llm::pg_utils::PgHelper pgh{mPgComm};
PGCHECK_THROW(pgh.allgatherv(input, output, sizes, options));
return true;
}

bool allgatherv(void const* sendbuf, int sendcount, mpi::MpiType sendtype, void* recvbuf,
std::vector<int> const& recvcounts, std::vector<int> const& displs, mpi::MpiType recvtype) const
{
if (isMpi())
{
mMpiComm->allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype);
return true;
}
TLLM_THROW("Input arguments only supported in mpi");
}

CacheTransceiverComm split(int color, int key)
{
if (isMpi())
{
auto subgroup = mMpiComm->split(color, key);
return CacheTransceiverComm(std::make_shared<mpi::MpiComm const>(std::move(subgroup)));
}
bool const initialized = Py_IsInitialized();
TLLM_CHECK_WITH_INFO(initialized, "Trying to use ProcessGroup communicator but Python is not initialized");
try
{
c10::intrusive_ptr<c10d::ProcessGroup> pgSub;
{
pybind11::gil_scoped_acquire gil;
auto const m = pybind11::module::import("tensorrt_llm._torch.distributed.pg_utils");
// Properly box the existing intrusive_ptr ProcessGroup into an IValue
// and convert to a Python object without constructing a new instance.
auto const py_pg = torch::jit::toPyObject(c10::IValue(mPgComm));

auto const py_sub_pg = m.attr("split")(color, key, py_pg);
pgSub = torch::jit::toCustomClass<c10d::ProcessGroup>(py_sub_pg);
}
return CacheTransceiverComm(pgSub);
}
catch (...)
{
TLLM_THROW("Failed to split process group");
}
}

private:
std::shared_ptr<mpi::MpiComm const> mMpiComm;
c10::intrusive_ptr<c10d::ProcessGroup> mPgComm;
};

class CacheTransceiverFactory
{
public:
@@ -124,9 +260,11 @@ class CacheTransceiver : public BaseCacheTransceiver
std::unique_ptr<CacheReceiver> mCacheReceiver;
std::vector<std::pair<LlmRequest*, std::future<void>>> mSenderFutures;
std::vector<std::pair<LlmRequest*, std::future<void>>> mRequesterFutures;
mpi::MpiComm const *mMpiGroupComm{nullptr}, *mMpiWorldComm{nullptr};
std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm, mMpiGroupDataComm,
mMpiGroupTPInDPComm;
mpi::MpiComm const* mMpiWorldComm{nullptr};

std::shared_ptr<CacheTransceiverComm> mGroupComm;
std::shared_ptr<CacheTransceiverComm> mGroupTensorParaComm, mGroupPipeParaComm, mGroupDataComm, mGroupTPInDPComm;

executor::kv_cache::CommState const* mCommState;
std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
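The new CacheTransceiverComm abstracts over two transport backends, a classic MPI communicator and a torch c10d::ProcessGroup, behind one rank/size/allgather/split surface so the cache transceiver no longer hard-codes MPI. Below is a minimal usage sketch, assuming the class lives in the tensorrt_llm::batch_manager namespace and that the process was launched under MPI; the function name and rank grouping are illustrative, not part of this PR.

// Usage sketch (illustrative; not part of the diff).
#include "tensorrt_llm/batch_manager/cacheTransceiver.h"

void sketchSubgroup(tensorrt_llm::mpi::MpiComm const& groupComm)
{
    using tensorrt_llm::batch_manager::CacheTransceiverComm; // assumed namespace

    // Non-owning wrap: the caller keeps ownership of groupComm.
    CacheTransceiverComm comm(&groupComm);

    int const rank = comm.getRank();

    // Pair up ranks: {0,1} -> color 0, {2,3} -> color 1, ...
    // With the MPI backend this forwards to MpiComm::split; with a
    // ProcessGroup backend it round-trips through the Python pg_utils helper.
    auto pairComm = comm.split(/*color=*/rank / 2, /*key=*/rank % 2);
    (void) pairComm;
}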
72 changes: 72 additions & 0 deletions cpp/include/tensorrt_llm/common/bindingUtils.h
@@ -0,0 +1,72 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "c10/util/intrusive_ptr.h"
#include <Python.h>

namespace tensorrt_llm::common
{

// Adapted from pybind11's example implementation:
// https://github.com/pybind/pybind11/blob/master/include/pybind11/conduit/pybind11_conduit_v1.h
// Copyright (c) 2024 The pybind Community.

inline void* get_raw_pointer_ephemeral(
PyObject* py_obj, std::type_info const* cpp_type_info, std::string const& pybind11_abi)
{
PyObject* cpp_type_info_capsule = PyCapsule_New(
const_cast<void*>(static_cast<void const*>(cpp_type_info)), typeid(std::type_info).name(), nullptr);
if (cpp_type_info_capsule == nullptr)
{
return nullptr;
}
PyObject* cpp_conduit = PyObject_CallMethod(
py_obj, "_pybind11_conduit_v1_", "yOy", pybind11_abi.c_str(), cpp_type_info_capsule, "raw_pointer_ephemeral");
Py_DECREF(cpp_type_info_capsule);
if (cpp_conduit == nullptr)
{
return nullptr;
}
void* raw_ptr = PyCapsule_GetPointer(cpp_conduit, cpp_type_info->name());
Py_DECREF(cpp_conduit);
if (PyErr_Occurred())
{
return nullptr;
}
return raw_ptr;
}

template <typename T, typename E>
T* get_type_pointer_ephemeral(PyObject* py_obj, std::string pybind11_abi)
{
void* raw_ptr = get_raw_pointer_ephemeral(py_obj, &typeid(T), pybind11_abi);
if (raw_ptr == nullptr)
{
throw E();
}
return static_cast<T*>(raw_ptr);
}

template <typename T, typename E>
c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_abi)
{
auto* const p = get_type_pointer_ephemeral<T, E>(py_obj, pybind11_abi);
return c10::intrusive_ptr<T>::reclaim_copy(p);
}

} // namespace tensorrt_llm::common
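These helpers use pybind11's conduit protocol to recover a raw C++ pointer from a Python-wrapped object even across extension-module ABI boundaries; get_intrusive_ptr then turns that ephemeral pointer into a co-owning handle via reclaim_copy, which adds a reference. A hedged sketch of how a c10d::ProcessGroup might be recovered follows; the exception type and caller are assumptions for illustration.

// Illustrative caller (not part of the diff). ConduitError and
// pgFromPython are assumptions for this sketch.
#include "tensorrt_llm/common/bindingUtils.h"
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <stdexcept>
#include <string>

struct ConduitError : std::runtime_error
{
    ConduitError()
        : std::runtime_error("pybind11 conduit lookup failed")
    {
    }
};

c10::intrusive_ptr<c10d::ProcessGroup> pgFromPython(PyObject* pyPg, std::string const& pybind11Abi)
{
    // get_intrusive_ptr calls reclaim_copy, so the returned handle holds its
    // own reference and safely outlives the Python object that produced it.
    return tensorrt_llm::common::get_intrusive_ptr<c10d::ProcessGroup, ConduitError>(pyPg, pybind11Abi);
}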
8 changes: 6 additions & 2 deletions cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -295,7 +295,11 @@
};
#endif

inline int getSMVersion()
/// @brief Get the SM version of the current device.
/// @param queryRealSmArch Whether to query the real SM architecture. For example, query the real architecture when
/// doing LUT tuning, and keep the aliased architecture when reusing SM 120 code on SM 121 devices.
/// @return The SM version of the current device.
inline int getSMVersion(bool queryRealSmArch = false)
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
@@ -304,7 +308,7 @@ inline int getSMVersion()
check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
int sm = sm_major * 10 + sm_minor;
if (sm == 121)
if (sm == 121 && !queryRealSmArch)
{
return 120;
}
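With this change, getSMVersion() keeps reporting SM 121 devices as SM 120 so existing SM 120 code paths are reused, while getSMVersion(true) exposes the physical architecture for cases such as LUT tuning. A small sketch of the two query modes; the logging is illustrative.

// Illustrative only: aliased vs. real query on an SM 121 device.
#include "tensorrt_llm/common/cudaUtils.h"
#include <cstdio>

void reportSmArch()
{
    int const aliasedSm = tensorrt_llm::common::getSMVersion();  // SM 121 reported as 120
    int const realSm = tensorrt_llm::common::getSMVersion(true); // physical arch, e.g. 121
    if (aliasedSm != realSm)
    {
        std::printf("Reusing SM %d kernels on SM %d hardware\n", aliasedSm, realSm);
    }
}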
23 changes: 23 additions & 0 deletions cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h
@@ -35,6 +35,7 @@
#include <cstdlib>
#include <memory>
#include <mutex>
#include <optional>
#include <thread>

#if ENABLE_MULTI_DEVICE
Expand Down Expand Up @@ -425,7 +426,29 @@ class MpiComm
return !(rhs == *this);
}

bool couldUseMPI() const
{
if (!mDisableMPI.has_value())
{
char const* val = std::getenv("TLLM_DISABLE_MPI");
mDisableMPI = (val != nullptr && std::string(val) == "1");
}
if (mDisableMPI.value())
{
throw std::runtime_error("MPI is disabled (TLLM_DISABLE_MPI=1); MPI must not be used");
}
return true;
}

private:
mutable std::optional<bool> mDisableMPI;
//! \brief Corresponds to `world()` by default, but can be overridden per process.
static MpiComm& mutableSession();

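couldUseMPI() lazily caches the TLLM_DISABLE_MPI environment variable and then throws on any attempted MPI use when it is set to 1. A hedged sketch of a guarded call site, assuming the MpiComm::session() accessor from the existing header; the fallback value is illustrative.

// Illustrative guard (not part of the diff): fall back to rank 0 when MPI
// has been disabled via TLLM_DISABLE_MPI=1.
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include <stdexcept>

int worldRankOrZero()
{
    auto const& comm = tensorrt_llm::mpi::MpiComm::session();
    try
    {
        comm.couldUseMPI(); // throws std::runtime_error when MPI is disabled
        return comm.getRank();
    }
    catch (std::runtime_error const&)
    {
        return 0; // single-process fallback
    }
}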