@@ -1476,7 +1476,15 @@ def _prepare_tp_inputs(
     # skip adding input_ids of CUDA graph dummy requests so that new_tokens_device
     # can be aligned to the correct positions.
     if not request.is_cuda_graph_dummy:
-        input_ids.append(request.get_last_tokens(beam))
+        # Track position for GPU update (draft model only)
+        if self.is_draft_model and num_accepted_tokens_device is not None:
+            start_idx = len(input_ids)
+            input_ids.append(request.get_last_tokens(beam))
+            end_idx = len(input_ids)
+            first_draft_input_ids_positions.append(
+                (start_idx, end_idx, request.py_seq_slot))
+        else:
+            input_ids.append(request.get_last_tokens(beam))
     past_seen_token_num = request.max_beam_num_tokens - 1
 else:
     # the request has previous tensor
@@ -1842,6 +1850,7 @@ def previous_seq_slots_device():
     self.iter_states['num_ctx_requests'] = num_ctx_requests
     self.iter_states['num_ctx_tokens'] = num_ctx_tokens
     self.iter_states['num_generation_tokens'] = num_generation_tokens
+    print(f"DEBUG: is_draft_model: {self.is_draft_model}, inputs: {inputs}")
     return inputs, self.gather_ids_cuda[:len(
         gather_ids)] if self.enable_spec_decode else None
18471856
0 commit comments