From 6cbd83286dd166f16a3d9400d3d6416fe5963cfe Mon Sep 17 00:00:00 2001
From: leslie-fang25 <leslief@nvidia.com>
Date: Sun, 26 Oct 2025 23:31:26 -0700
Subject: [PATCH] [TRTLLM-8763][chore] Deprecate pybind based
 GuidedDecodingConfig usage in torch backend

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
---
 .../_torch/pyexecutor/grammar_matcher.py      |  4 +++-
 .../_torch/pyexecutor/guided_decoder.py       |  4 +++-
 .../_torch/pyexecutor/py_executor_creator.py  |  4 ++--
 tensorrt_llm/llmapi/llm_args.py               | 23 +++++++++++++++++--
 4 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py b/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
index 9688b4e0243..3de8d6f2b40 100644
--- a/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
+++ b/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
@@ -6,7 +6,9 @@
 import torch
 import xgrammar
 
-from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
+from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig
+
+from ...bindings.executor import GuidedDecodingParams
 
 
 class GrammarMatcher(ABC):
diff --git a/tensorrt_llm/_torch/pyexecutor/guided_decoder.py b/tensorrt_llm/_torch/pyexecutor/guided_decoder.py
index d280acf4c09..efd3379ee09 100644
--- a/tensorrt_llm/_torch/pyexecutor/guided_decoder.py
+++ b/tensorrt_llm/_torch/pyexecutor/guided_decoder.py
@@ -5,8 +5,10 @@
 
 import torch
 
+from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig
+
 from ..._utils import nvtx_range
-from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
+from ...bindings.executor import GuidedDecodingParams
 from ...bindings.internal.batch_manager import LlmRequestType
 from ...logger import logger
 from ..hostfunc import hostfunc
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index c57b9d3c6fa..0936bd5dc98 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -14,9 +14,9 @@
 import tensorrt_llm
 from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
 from tensorrt_llm._utils import get_sm_version, mpi_disabled
-from tensorrt_llm.bindings.executor import GuidedDecodingConfig
 from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy,
-                                          ContextChunkingPolicy, LoadFormat,
+                                          ContextChunkingPolicy,
+                                          GuidedDecodingConfig, LoadFormat,
                                           TorchLlmArgs)
 from tensorrt_llm.llmapi.tokenizer import (TokenizerBase,
                                            _llguidance_tokenizer_info,
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index b537a4702be..cd2967d5efc 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -45,8 +45,7 @@
                                  KvCacheConfig as _KvCacheConfig,
                                  LookaheadDecodingConfig as _LookaheadDecodingConfig,
                                  PeftCacheConfig as _PeftCacheConfig,
-                                 SchedulerConfig as _SchedulerConfig,
-                                 GuidedDecodingConfig as _GuidedDecodingConfig) # isort: skip
+                                 SchedulerConfig as _SchedulerConfig) # isort: skip
 # isort: on
 
 # yapf: enable
@@ -165,6 +164,26 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int,
         return batch_sizes
 
 
+class GuidedDecodingConfig(StrictBaseModel):
+    # Config the torch backend uses in place of the pybind GuidedDecodingConfig.
+    class GuidedDecodingBackend(Enum):  # selectable guided decoding backends
+        XGRAMMAR = 0  # presumably matches the pybind enum value; confirm
+        LLGUIDANCE = 1
+    # NOTE(review): presumably a pydantic model, so construct with keywords only.
+    backend: GuidedDecodingBackend = Field(
+        default=GuidedDecodingBackend.XGRAMMAR,
+        description="The backend for guided decoding config.")
+    encoded_vocab: Optional[List[str]] = Field(
+        default=None,
+        description="The encoded vocab for guided decoding config.")
+    tokenizer_str: Optional[str] = Field(
+        default=None,
+        description="The tokenizer string for guided decoding config.")
+    stop_token_ids: Optional[List[int]] = Field(
+        default=None,
+        description="The stop token ids for guided decoding config.")
+
+
 class BaseSparseAttentionConfig(StrictBaseModel):
     """
     Configuration for sparse attention.