 from __future__ import annotations
 
 import traceback
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, cast
 
 import torch
 
@@ -73,14 +73,30 @@ def __init__(
         self.guided_decoder = guided_decoder
 
         self.use_static_draft_loop = draft_model_engine.model_is_wrapped
+        self.d2t: Optional[torch.Tensor] = None
         if self.use_static_draft_loop:
             # TODO: enable sampling/guided decoding on static draft loop
             assert guided_decoder is None
             assert spec_config._allow_greedy_draft_tokens
+        else:
+            # Handle d2t data if available. Static drafting loops should incorporate d2t
+            # in their implementations.
+            if hasattr(self.draft_model_engine.model.model, "d2t"):
+                self.d2t = self.draft_model_engine.model.model.d2t.data
+        self.d2t_host: Optional[torch.Tensor] = None
+        if self.d2t is not None:
+            self.d2t_host = self.d2t.to(device="cpu")
 
     def _create_draft_request(self, request: LlmRequest,
                               input_tokens: Optional[List]) -> LlmRequest:
         """Create a draft request with common parameters."""
+        needs_probs = self.sampler.should_provide_draft_probs(request)
+
+        # NB: Currently, "d2t" is applied to draft tokens, but not to draft logits,
+        # breaking _process_draft_tokens_rejection_sampling.
+        if self.d2t is not None and needs_probs:
+            raise ValueError("d2t does not yet support non-greedy sampling")
+
         return LlmRequest(
             input_tokens=input_tokens,
             request_id=request.py_request_id,
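For context, a minimal sketch (not part of the patch) of how the cached d2t tensor is assumed to behave: a 1-D integer tensor of per-token offsets that maps a draft-model token id to the corresponding target-model token id, which is why a host copy (d2t_host) is kept for the per-token lookups later in the file. The vocabulary size and offset values below are invented for illustration.

    import torch

    # Hypothetical tiny draft vocabulary; d2t[i] is the offset that turns draft id i
    # into the matching target-model id: target_id = i + d2t[i].
    draft_vocab_size = 8
    d2t = torch.zeros(draft_vocab_size, dtype=torch.long)
    d2t[5] = 100  # made-up mapping: draft token 5 -> target token 105

    d2t_host = d2t.to(device="cpu")  # cached once, mirroring __init__ above

    draft_token = 5
    target_token = draft_token + int(d2t_host[draft_token].item())
    assert target_token == 105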
@@ -94,8 +110,7 @@ def _create_draft_request(self, request: LlmRequest,
             True,  # prepare_draft_tokens uses overlap scheduling
             is_draft=True,
             # NB: self.sampler is shared with PyExecutor
-            return_generation_logits=self.sampler.should_provide_draft_probs(
-                request))
+            return_generation_logits=needs_probs)
 
     def _initialize_draft_tokens(self, request: LlmRequest) -> Tuple[int, int]:
         """Initialize draft token tracking for a request."""
@@ -301,12 +316,6 @@ def forward_draft_model(
             resource_manager,
             new_tensors_device=previous_tensors)
 
-        # Handle d2t data if available. Static drafting loops should incorporate d2t
-        # in their implementations.
-        if not self.use_static_draft_loop and hasattr(
-                self.draft_model_engine.model.model, 'd2t'):
-            outputs['d2t'] = self.draft_model_engine.model.model.d2t.data
-
         return outputs
 
     def sample_async(
@@ -365,6 +374,7 @@ def update_requests(
         """Update requests with sample state."""
         self.sampler.update_requests(sample_state, resource_manager)
 
+    @torch.inference_mode()
     def process_decoded_tokens(
             self, draft_batch: ScheduledRequests,
             req_id_to_old_request: Dict[int, LlmRequest]) -> List[LlmRequest]:
@@ -378,7 +388,16 @@ def process_decoded_tokens(
                 self.draft_seq_slot_manager.free_resources(req)
                 continue
 
-            target_model_req.py_draft_tokens.append(req.get_last_tokens(0))
+            draft_tokens: List[int] = req.get_last_tokens(0)
+            if self.d2t_host is not None:
+                # NB: This is not batched over requests, but considered acceptable given
+                # that the code already loops over requests and there are few draft
+                # tokens per request.
+                draft_tokens = [
+                    tok + cast(int, self.d2t_host[tok].item())
+                    for tok in draft_tokens
+                ]
+            target_model_req.py_draft_tokens.append(draft_tokens)
             target_model_req.py_draft_logits = req.py_result.generation_logits  # forwards Nones
             if req.state != LlmRequestState.GENERATION_COMPLETE and len(
                     target_model_req.py_draft_tokens
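The list comprehension above remaps tokens one at a time on the host. A batched equivalent is sketched below, under the same assumption that d2t_host holds additive offsets indexed by draft token id; it is not what the patch does, but it illustrates the trade-off the NB comment mentions: since each request carries only a handful of draft tokens, the simple per-token loop was considered acceptable.

    from typing import List
    import torch

    def remap_draft_tokens(draft_tokens: List[int], d2t_host: torch.Tensor) -> List[int]:
        # Gather all offsets with one indexing op instead of a Python loop.
        idx = torch.tensor(draft_tokens, dtype=torch.long)
        return (idx + d2t_host[idx]).tolist()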
@@ -591,8 +610,7 @@ def _execute_draft_iteration(
 
         if self.guided_decoder is not None:
             self.guided_decoder.add_batch(draft_batch)
-            self.guided_decoder.execute(outputs['logits'],
-                                        d2t=outputs.get('d2t'))
+            self.guided_decoder.execute(outputs['logits'], d2t=self.d2t)
 
         sample_state = self.sample_async(draft_batch, outputs, resource_manager)
         self.update_request_states(draft_batch)
@@ -726,8 +744,7 @@ def generate_draft_tokens_with_overlap(
             # Handle guided decoder and sampling for non-static loop
             if self.guided_decoder is not None:
                 self.guided_decoder.add_batch(draft_batch)
-                self.guided_decoder.execute(outputs['logits'],
-                                            d2t=outputs.get('d2t'))
+                self.guided_decoder.execute(outputs['logits'], d2t=self.d2t)
             draft_sample_state = self.sample_async(draft_batch, outputs,
                                                    resource_manager)
 
@@ -791,8 +808,7 @@ def prepare_draft_tokens(
 
             if self.guided_decoder is not None:
                 self.guided_decoder.add_batch(draft_batch)
-                self.guided_decoder.execute(outputs['logits'],
-                                            d2t=outputs.get('d2t'))
+                self.guided_decoder.execute(outputs['logits'], d2t=self.d2t)
             sample_state = self.sample_async(draft_batch, outputs,
                                              resource_manager)
             self.update_request_states(draft_batch)
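Taken together, the changes move d2t from a per-forward entry in outputs to a member cached in __init__, so every guided_decoder.execute call receives the same tensor via self.d2t. Below is a rough sketch of the greedy path with made-up sizes and values (not the engine's actual API), showing why applying d2t only to the sampled token ids is enough when no draft probabilities are requested.

    import torch

    draft_vocab_size = 8  # hypothetical size for illustration
    d2t = torch.zeros(draft_vocab_size, dtype=torch.long)
    d2t[3] = 7  # made up: draft id 3 corresponds to target id 10

    logits = torch.randn(1, draft_vocab_size)         # draft-model logits
    draft_id = int(logits.argmax(dim=-1).item())      # greedy draft token
    target_id = draft_id + int(d2t[draft_id].item())  # id stored in py_draft_tokens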