[None][feat] Make 2-model spec dec use the 1-model kernels (Hopper)

mikeiovine · mikeiovine · commit 2e002162d75f · 2025-10-30T12:42:16.000-07:00
Signed-off-by: Mike Iovine &lt;6158008+mikeiovine@users.noreply.github.com&gt;
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
@@ -5,7 +5,6 @@
 
 import torch
 
-from ..._utils import get_sm_version
 from ..attention_backend.trtllm import AttentionBackend, TrtllmAttention
 from ..pyexecutor.resource_manager import BaseResourceManager
 
@@ -117,13 +116,7 @@ def extend_ctx(self, attention_backend: Type[AttentionBackend]):
             # 1-model has separate logic for handling draft tokens
             return False
 
-        if issubclass(attention_backend,
-                      TrtllmAttention) and self.is_mtp_eagle():
-            # TRTLLM MLA does not work with the chunked context mode.
-            return False
-
-        return not issubclass(attention_backend,
-                              TrtllmAttention) or get_sm_version() != 100
+        return not issubclass(attention_backend, TrtllmAttention)
 
     def attention_need_spec_dec_mode(
         self,
@@ -137,9 +130,8 @@ def attention_need_spec_dec_mode(
         If true, the attention backend kernel needs to run in spec-dec mode (multi-token query mode).
         """
         is_trtllm_attention = issubclass(attention_backend, TrtllmAttention)
-        return self.is_eagle3_one_model() or (
-            self.is_eagle3() and spec_resource_manager.is_first_draft
-            and is_trtllm_attention and use_chain_drafter and is_draft_model)
+        return self.is_eagle3_one_model() or not is_draft_model or (
+            spec_resource_manager.is_first_draft and is_trtllm_attention)
 
     @staticmethod
     def from_string(name: Optional[str]) -> "SpeculativeDecodingMode":
diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py
@@ -51,11 +51,11 @@ def enforce_single_worker(monkeypatch):
         [False, "FLASHINFER", False, False, False, False, True, False, False],
     ])
 @pytest.mark.high_cuda_memory
-def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
-                      disable_overlap_scheduler: bool, enable_block_reuse: bool,
-                      use_one_model: bool, enable_chunked_prefill: bool,
-                      use_chain_drafter: bool, multi_batch: bool,
-                      attention_dp: bool, request):
+def test_foo(use_cuda_graph: bool, attn_backend: str,
+             disable_overlap_scheduler: bool, enable_block_reuse: bool,
+             use_one_model: bool, enable_chunked_prefill: bool,
+             use_chain_drafter: bool, multi_batch: bool, attention_dp: bool,
+             request):
     # Eagle3 one model works with overlap scheduler and block reuse.
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 35:
@@ -136,7 +136,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
             num_tokens = len(new_tokens)
 
         accept_rate = num_accepted / num_drafted
-        assert accept_rate > 0.15
+        assert accept_rate > 0.10
 
     # Output tests
     sampling_params = SamplingParams(max_tokens=10, temperature=0)