
Commit 4a8bd69

fix: MTP in chunked prefill mode
1 parent db1b64c commit 4a8bd69

6 files changed, +49 -9 lines changed

lightllm/common/basemodel/batch_objs.py

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ class ModelInput:
     # Parameters used during the prefill stage; they are not consumed by the
     # inference process itself but by resource management outside of inference.
     b_prefill_has_output_cpu: List[bool] = None  # marks whether a prefill request produces output
+    b_chunked_prefill_next_token_ids_cpu: List[int] = None  # for chunked prefill mtp

     # Dedicated variables for special models and special modes, used to pass
     # special input values; they only take effect under those special model modes.
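
The new field mirrors b_prefill_has_output_cpu: both carry one entry per request in the batch, and the sentinel -1 is used exactly for requests whose current chunk is the final one. A minimal, self-contained sketch of that pairing (the trimmed class and the values below are illustrative, not the real ModelInput):

from dataclasses import dataclass, field
from typing import List

@dataclass
class ModelInputSketch:  # hypothetical, trimmed to the two fields discussed here
    b_prefill_has_output_cpu: List[bool] = field(default_factory=list)
    b_chunked_prefill_next_token_ids_cpu: List[int] = field(default_factory=list)

inp = ModelInputSketch(
    b_prefill_has_output_cpu=[False, True],           # req0 mid-prompt, req1 on its last chunk
    b_chunked_prefill_next_token_ids_cpu=[1234, -1],  # req0: next prompt token, req1: sentinel
)
assert len(inp.b_prefill_has_output_cpu) == len(inp.b_chunked_prefill_next_token_ids_cpu)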

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 8 additions & 2 deletions
@@ -393,7 +393,13 @@ def get_input_token_ids(self):
     def get_chuncked_input_token_ids(self):
         chunked_start = self.cur_kv_len
         chunked_end = min(self.get_cur_total_len(), chunked_start + self.shm_req.chunked_prefill_size)
-        return self.shm_req.shm_prompt_ids.arr[0:chunked_end]
+
+        if chunked_end < self.get_cur_total_len():
+            next_token_id = self.shm_req.shm_prompt_ids.arr[chunked_end]
+        else:
+            next_token_id = -1  # last chunk
+
+        return self.shm_req.shm_prompt_ids.arr[0:chunked_end], next_token_id

     def get_chuncked_input_token_len(self):
         chunked_start = self.cur_kv_len
@@ -438,7 +444,7 @@ def _stop_sequences_matched(self, output_len: int):

     def prefill_need_token_num(self, is_chuncked_prefill: bool):
         if is_chuncked_prefill:
-            input_token_ids = self.get_chuncked_input_token_ids()
+            input_token_ids, _ = self.get_chuncked_input_token_ids()
         else:
             input_token_ids = self.get_input_token_ids()
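
The reworked getter now also reports the first token of the following chunk, so a non-final chunk can hand the MTP draft model a real next token instead of whatever gets sampled from an incomplete prompt. A standalone sketch of that boundary logic, using plain lists instead of the shared-memory prompt array (names and values are illustrative):

def chunked_slice(prompt_ids, cur_kv_len, chunked_prefill_size, cur_total_len):
    chunked_start = cur_kv_len
    chunked_end = min(cur_total_len, chunked_start + chunked_prefill_size)
    if chunked_end < cur_total_len:
        next_token_id = prompt_ids[chunked_end]  # first token of the next chunk
    else:
        next_token_id = -1  # last chunk: the real next token will be sampled
    return prompt_ids[0:chunked_end], next_token_id

prompt = [11, 12, 13, 14, 15]
print(chunked_slice(prompt, cur_kv_len=0, chunked_prefill_size=2, cur_total_len=5))  # ([11, 12], 13)
print(chunked_slice(prompt, cur_kv_len=4, chunked_prefill_size=2, cur_total_len=5))  # ([11, 12, 13, 14, 15], -1)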

lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py

Lines changed: 7 additions & 1 deletion
@@ -192,8 +192,14 @@ def prefill_mtp(
             mask_func=self.prefill_mask_func,
         )
         # mtp kv fill
+        b_has_out = torch.tensor(model_input.b_prefill_has_output_cpu, dtype=torch.bool, device="cuda")
+        b_chunked_next_token_ids = torch.tensor(
+            model_input.b_chunked_prefill_next_token_ids_cpu, dtype=torch.int64, device="cuda"
+        )
+        mtp_next_token_ids = torch.where(b_has_out, next_token_ids, b_chunked_next_token_ids)
+
         self._draft_prefill_forward(
-            model_input=model_input, model_output=model_output, next_token_ids=next_token_ids
+            model_input=model_input, model_output=model_output, next_token_ids=mtp_next_token_ids
         )
         sync_event = torch.cuda.Event()
         sync_event.record()
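
The fix itself is the torch.where selection above: requests that finished their prompt (b_prefill_has_output_cpu is True) keep the freshly sampled next_token_ids, while requests still mid-prompt fall back to the pre-recorded first token of their next chunk, so every entry handed to _draft_prefill_forward is a valid token. A self-contained illustration of that selection with made-up values (CPU tensors here; the real code builds them on CUDA):

import torch

next_token_ids = torch.tensor([501, 502, 503], dtype=torch.int64)   # sampled at prefill
b_prefill_has_output_cpu = [True, False, True]                      # is this the request's last chunk?
b_chunked_prefill_next_token_ids_cpu = [-1, 777, -1]                # next prompt token, or -1 sentinel

b_has_out = torch.tensor(b_prefill_has_output_cpu, dtype=torch.bool)
b_chunked_next_token_ids = torch.tensor(b_chunked_prefill_next_token_ids_cpu, dtype=torch.int64)
mtp_next_token_ids = torch.where(b_has_out, next_token_ids, b_chunked_next_token_ids)
print(mtp_next_token_ids)  # tensor([501, 777, 503])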

lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py

Lines changed: 24 additions & 4 deletions
@@ -354,7 +354,13 @@ def prefill_mtp(self, event_pack: OverlapEventPack, prefill_reqs: List[InferReq]
         # mtp kv fill
         draft_next_token_ids_gpu = torch.zeros((model_input.batch_size), dtype=torch.int64, device="cuda")
         if req_num > 0:
-            draft_next_token_ids_gpu[0:req_num].copy_(next_token_ids)
+            b_has_out = torch.tensor(b_has_out_cpu, dtype=torch.bool, device="cuda")
+            b_chunked_next_token_ids = torch.tensor(
+                model_input.b_chunked_prefill_next_token_ids_cpu[0:req_num], dtype=torch.int64, device="cuda"
+            )
+            mtp_next_token_ids = torch.where(b_has_out, next_token_ids, b_chunked_next_token_ids)
+            draft_next_token_ids_gpu[0:req_num].copy_(mtp_next_token_ids)
+
         self._draft_prefill_forward(
             model_input=model_input,
             model_output=model_output,
@@ -633,13 +639,27 @@ def prefill_overlap_mtp(self, event_pack: OverlapEventPack, prefill_reqs: List[I
         draft_model_input0, draft_model_input1 = model_input0, model_input1
         draft_next_token_ids_gpu0 = torch.zeros((model_input0.batch_size), dtype=torch.int64, device="cuda")
         if req_num0 > 0:
-            draft_next_token_ids_gpu0[0:req_num0].copy_(next_token_ids[0:req_num0], non_blocking=True)
+            b_has_out0 = torch.tensor(
+                model_input0.b_prefill_has_output_cpu[0:req_num0], dtype=torch.bool, device="cuda"
+            )
+            b_chunked_next_token_ids0 = torch.tensor(
+                model_input0.b_chunked_prefill_next_token_ids_cpu[0:req_num0], dtype=torch.int64, device="cuda"
+            )
+            mtp_next_token_ids0 = torch.where(b_has_out0, next_token_ids[0:req_num0], b_chunked_next_token_ids0)
+            draft_next_token_ids_gpu0[0:req_num0].copy_(mtp_next_token_ids0, non_blocking=True)

         draft_next_token_ids_gpu1 = torch.zeros((model_input1.batch_size), dtype=torch.int64, device="cuda")
         if req_num1 > 0:
-            draft_next_token_ids_gpu1[0:req_num1].copy_(
-                next_token_ids[req_num0 : (req_num0 + req_num1)], non_blocking=True
+            b_has_out1 = torch.tensor(
+                model_input1.b_prefill_has_output_cpu[0:req_num1], dtype=torch.bool, device="cuda"
+            )
+            b_chunked_next_token_ids1 = torch.tensor(
+                model_input1.b_chunked_prefill_next_token_ids_cpu[0:req_num1], dtype=torch.int64, device="cuda"
+            )
+            mtp_next_token_ids1 = torch.where(
+                b_has_out1, next_token_ids[req_num0 : (req_num0 + req_num1)], b_chunked_next_token_ids1
             )
+            draft_next_token_ids_gpu1[0:req_num1].copy_(mtp_next_token_ids1, non_blocking=True)

         draft_model_output0, draft_model_output1 = model_output0, model_output1
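
In the overlap path the batch is split into two microbatches, so the same where-selection runs per slice: microbatch 0 uses next_token_ids[0:req_num0] and microbatch 1 uses next_token_ids[req_num0 : req_num0 + req_num1], each paired with its own b_prefill_has_output_cpu and b_chunked_prefill_next_token_ids_cpu lists. A small sketch of that per-slice selection (CPU tensors, made-up values):

import torch

req_num0, req_num1 = 2, 2
next_token_ids = torch.tensor([11, 12, 13, 14])  # sampled tokens: microbatch 0 then microbatch 1

has_out0 = torch.tensor([True, False])
chunk_next0 = torch.tensor([-1, 900])
mtp0 = torch.where(has_out0, next_token_ids[0:req_num0], chunk_next0)
print(mtp0)  # tensor([ 11, 900])

has_out1 = torch.tensor([False, True])
chunk_next1 = torch.tensor([901, -1])
mtp1 = torch.where(has_out1, next_token_ids[req_num0 : (req_num0 + req_num1)], chunk_next1)
print(mtp1)  # tensor([901,  14])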

lightllm/server/router/model_infer/mode_backend/generic_padded_pre_process.py

Lines changed: 5 additions & 1 deletion
@@ -36,14 +36,16 @@ def padded_prepare_prefill_inputs(
     b_ready_cache_len = []
     b_mtp_index = []
     b_prefill_has_output = []
+    b_chunked_prefill_next_token_ids = []

     for req in req_objs:

         run_reqs.append(req)
         batch_multimodal_params.append(req.multimodal_params)
         b_req_idx.append(req.req_idx)

-        input_token_ids = req.get_chuncked_input_token_ids()
+        input_token_ids, next_token_id = req.get_chuncked_input_token_ids()
+        b_chunked_prefill_next_token_ids.append(next_token_id)
         b_prefill_has_output.append(False if len(input_token_ids) < req.get_cur_total_len() else True)
         seq_len = len(input_token_ids)
         input_token_len = seq_len - req.cur_kv_len
@@ -65,6 +67,7 @@ def padded_prepare_prefill_inputs(
         b_q_seq_len.append(1)
         b_mtp_index.append(0)
         b_prefill_has_output.append(False)
+        b_chunked_prefill_next_token_ids.append(-1)
         b_ready_cache_len.append(0)
         total_token_num += 1
         prefix_total_token_num += 0
@@ -112,6 +115,7 @@ def padded_prepare_prefill_inputs(
         b_ready_cache_len=b_ready_cache_len,
         is_prefill=True,
         b_prefill_has_output_cpu=b_prefill_has_output,
+        b_chunked_prefill_next_token_ids_cpu=b_chunked_prefill_next_token_ids,
     )
     if is_multimodal:
         model_input.multimodal_params = batch_multimodal_params
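
Padded dummy slots also get a -1 entry, so the new list stays index-aligned with b_prefill_has_output across real and padding requests. A tiny sketch of that invariant (illustrative values; the real loop iterates over req_objs):

# (req's next-chunk token or -1, does this chunk produce real output?)
reqs = [(1234, False), (-1, True)]

b_prefill_has_output = []
b_chunked_prefill_next_token_ids = []
for next_token_id, has_output in reqs:
    b_prefill_has_output.append(has_output)
    b_chunked_prefill_next_token_ids.append(next_token_id)

# one padding slot, mirrored in both lists
b_prefill_has_output.append(False)
b_chunked_prefill_next_token_ids.append(-1)

assert len(b_prefill_has_output) == len(b_chunked_prefill_next_token_ids) == 3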

lightllm/server/router/model_infer/mode_backend/generic_pre_process.py

Lines changed: 4 additions & 1 deletion
@@ -20,14 +20,16 @@ def prepare_prefill_inputs(
     b_ready_cache_len = []
     b_mtp_index = []
     b_prefill_has_output = []
+    b_chunked_prefill_next_token_ids = []

     for req in req_objs:
         run_reqs.append(req)
         batch_multimodal_params.append(req.multimodal_params)
         b_req_idx.append(req.req_idx)

         if is_chuncked_mode:
-            input_token_ids = req.get_chuncked_input_token_ids()
+            input_token_ids, next_token_id = req.get_chuncked_input_token_ids()
+            b_chunked_prefill_next_token_ids.append(next_token_id)
         else:
             input_token_ids = req.get_input_token_ids()

@@ -80,6 +82,7 @@ def prepare_prefill_inputs(
         b_ready_cache_len=b_ready_cache_len,
         is_prefill=True,
         b_prefill_has_output_cpu=b_prefill_has_output,
+        b_chunked_prefill_next_token_ids_cpu=b_chunked_prefill_next_token_ids,
         prefix_total_token_num=prefix_total_token_num,
     )
     if is_multimodal:
