Skip to content

Commit 451959c

Browse files
[TRTLLM-8763][chore] Deprecate pybind based GuidedDecodingConfig usage in torch backend (#8717)
Signed-off-by: leslie-fang25 <[email protected]>
1 parent fc3b6f5 commit 451959c

File tree

4 files changed

+29
-6
lines changed

4 files changed

+29
-6
lines changed

tensorrt_llm/_torch/pyexecutor/grammar_matcher.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,9 @@
66
import torch
77
import xgrammar
88

9-
from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
9+
from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig
10+
11+
from ...bindings.executor import GuidedDecodingParams
1012

1113

1214
class GrammarMatcher(ABC):

tensorrt_llm/_torch/pyexecutor/guided_decoder.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,10 @@
55

66
import torch
77

8+
from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig
9+
810
from ..._utils import nvtx_range
9-
from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
11+
from ...bindings.executor import GuidedDecodingParams
1012
from ...bindings.internal.batch_manager import LlmRequestType
1113
from ...logger import logger
1214
from ..hostfunc import hostfunc

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,9 @@
1414
import tensorrt_llm
1515
from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
1616
from tensorrt_llm._utils import get_sm_version, mpi_disabled
17-
from tensorrt_llm.bindings.executor import GuidedDecodingConfig
1817
from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy,
19-
ContextChunkingPolicy, LoadFormat,
18+
ContextChunkingPolicy,
19+
GuidedDecodingConfig, LoadFormat,
2020
TorchLlmArgs)
2121
from tensorrt_llm.llmapi.tokenizer import (TokenizerBase,
2222
_llguidance_tokenizer_info,

tensorrt_llm/llmapi/llm_args.py

Lines changed: 21 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,7 @@
4444
KvCacheConfig as _KvCacheConfig,
4545
LookaheadDecodingConfig as _LookaheadDecodingConfig,
4646
PeftCacheConfig as _PeftCacheConfig,
47-
SchedulerConfig as _SchedulerConfig,
48-
GuidedDecodingConfig as _GuidedDecodingConfig) # isort: skip
47+
SchedulerConfig as _SchedulerConfig) # isort: skip
4948
# isort: on
5049

5150
# yapf: enable
@@ -164,6 +163,26 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int,
164163
return batch_sizes
165164

166165

166+
class GuidedDecodingConfig(StrictBaseModel):
167+
168+
class GuidedDecodingBackend(Enum):
169+
XGRAMMAR = 0
170+
LLGUIDANCE = 1
171+
172+
backend: GuidedDecodingBackend = Field(
173+
default=GuidedDecodingBackend.XGRAMMAR,
174+
description="The backend for guided decoding config.")
175+
encoded_vocab: Optional[List[str]] = Field(
176+
default=None,
177+
description="The encoded vocab for guided decoding config.")
178+
tokenizer_str: Optional[str] = Field(
179+
default=None,
180+
description="The tokenizer string for guided decoding config.")
181+
stop_token_ids: Optional[List[int]] = Field(
182+
default=None,
183+
description="The stop token ids for guided decoding config.")
184+
185+
167186
class BaseSparseAttentionConfig(StrictBaseModel):
168187
"""
169188
Configuration for sparse attention.

0 commit comments

Comments (0)