Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
import torch
import xgrammar

from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig

from ...bindings.executor import GuidedDecodingParams


class GrammarMatcher(ABC):
Expand Down
4 changes: 3 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/guided_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@

import torch

from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig

from ..._utils import nvtx_range
from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
from ...bindings.executor import GuidedDecodingParams
from ...bindings.internal.batch_manager import LlmRequestType
from ...logger import logger
from ..hostfunc import hostfunc
Expand Down
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
import tensorrt_llm
from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
from tensorrt_llm._utils import get_sm_version, mpi_disabled
from tensorrt_llm.bindings.executor import GuidedDecodingConfig
from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy,
ContextChunkingPolicy, LoadFormat,
ContextChunkingPolicy,
GuidedDecodingConfig, LoadFormat,
TorchLlmArgs)
from tensorrt_llm.llmapi.tokenizer import (TokenizerBase,
_llguidance_tokenizer_info,
Expand Down
23 changes: 21 additions & 2 deletions tensorrt_llm/llmapi/llm_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@
KvCacheConfig as _KvCacheConfig,
LookaheadDecodingConfig as _LookaheadDecodingConfig,
PeftCacheConfig as _PeftCacheConfig,
SchedulerConfig as _SchedulerConfig,
GuidedDecodingConfig as _GuidedDecodingConfig) # isort: skip
SchedulerConfig as _SchedulerConfig) # isort: skip
# isort: on

# yapf: enable
Expand Down Expand Up @@ -165,6 +164,26 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int,
return batch_sizes


class GuidedDecodingConfig(StrictBaseModel):
    """Configuration for guided decoding.

    Bundles the backend selection together with the tokenizer-derived
    data (encoded vocabulary, serialized tokenizer, stop token ids)
    that a guided-decoding grammar backend needs.

    NOTE(review): this class mirrors the former
    ``bindings.executor.GuidedDecodingConfig`` it replaces in the
    surrounding diff — field semantics presumably match the C++
    binding; confirm against that definition.
    """

    class GuidedDecodingBackend(Enum):
        """Selectable guided-decoding backend implementations."""
        # Backed by the xgrammar library.
        XGRAMMAR = 0
        # Backed by the llguidance library.
        LLGUIDANCE = 1

    # Which grammar backend to use; defaults to xgrammar.
    backend: GuidedDecodingBackend = Field(
        default=GuidedDecodingBackend.XGRAMMAR,
        description="The backend for guided decoding config.")
    # Optional token strings of the tokenizer vocabulary, indexed by
    # token id; None means the backend derives it elsewhere.
    encoded_vocab: Optional[List[str]] = Field(
        default=None,
        description="The encoded vocab for guided decoding config.")
    # Optional serialized tokenizer representation passed through to
    # the backend.
    tokenizer_str: Optional[str] = Field(
        default=None,
        description="The tokenizer string for guided decoding config.")
    # Optional ids of tokens that terminate generation for the grammar
    # matcher.
    stop_token_ids: Optional[List[int]] = Field(
        default=None,
        description="The stop token ids for guided decoding config.")


class BaseSparseAttentionConfig(StrictBaseModel):
"""
Configuration for sparse attention.
Expand Down
Loading