Commit c7c093f

implement async scheduling for mtp

Signed-off-by: Ronald1995 <[email protected]>
1 parent 84d7f5a

File tree: 6 files changed, +1813 -1213 lines changed

vllm_ascend/attention/attention_v1.py

Lines changed: 1 addition & 1 deletion

@@ -348,7 +348,7 @@ def build(
                              device=query_start_loc_cpu.device)
         ])

-        query_start_loc = query_start_loc_cpu.to(self.device,
+        query_start_loc = query_start_loc_cpu.pin_memory().to(self.device,
                                                  non_blocking=True)

        if get_ascend_device_type() == AscendDeviceType._310P:
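
Note on the hunk above: non_blocking=True only overlaps a host-to-device transfer when the source tensor lives in pinned (page-locked) memory; with a pageable tensor it silently degrades to a synchronous copy, which is what async scheduling is trying to avoid. A minimal standalone sketch of the pattern, assuming a CUDA/CPU fallback in place of the Ascend npu device used in the diff:

    import torch

    def h2d_async(cpu_tensor: torch.Tensor, device: torch.device) -> torch.Tensor:
        # pin_memory() page-locks the host buffer so the DMA engine can copy it
        # while the host thread keeps running; without it, non_blocking=True is
        # effectively a normal blocking copy.
        if device.type == "cpu":
            return cpu_tensor
        return cpu_tensor.pin_memory().to(device, non_blocking=True)

    if __name__ == "__main__":
        dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        query_start_loc_cpu = torch.arange(0, 1024, 8, dtype=torch.int32)
        query_start_loc = h2d_async(query_start_loc_cpu, dev)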

vllm_ascend/attention/mla_v1.py

Lines changed: 22 additions & 14 deletions

@@ -566,10 +566,13 @@ def build(
                 out=padded_local_cu_chunk_seq_lens_cpu[:, 1:],
                 dtype=torch.int32,
             )
-            chunked_context_metadata = \
-                AscendMLAPrefillMetadata.ChunkedContextMetadata(
-                    cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
-                    starts=local_chunk_starts.to(device, non_blocking=True),
+            chunked_context_metadata = AscendMLAPrefillMetadata.ChunkedContextMetadata(
+                cu_seq_lens=cu_seq_lens_cpu.pin_memory().to(
+                    device, non_blocking=True
+                ),
+                starts=local_chunk_starts.pin_memory().to(
+                    device, non_blocking=True
+                ),
                 seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(),
                 max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
                 chunk_seq_lens=chunk_seq_lens,
@@ -578,22 +581,27 @@ def build(
                 padded_chunk_seq_lens_npu=padded_local_chunk_seq_lens.npu(),
                 padded_local_chunk_seq_lens=padded_local_chunk_seq_lens.tolist(),
                 local_context_lens_allranks=local_context_lens_allranks.tolist(),
-                padded_local_cu_seq_lens=padded_local_cu_chunk_seq_lens_cpu.to(
+                padded_local_cu_seq_lens=padded_local_cu_chunk_seq_lens_cpu.pin_memory().to(
                     device, non_blocking=True
                 ),
                 cu_seq_lens_lst=cu_seq_lens_cpu.tolist(),
                 chunk_size=padded_local_max_context_chunk_across_ranks,
             )
         else:
-            chunked_context_metadata = \
+            chunked_context_metadata = (
                 AscendMLAPrefillMetadata.ChunkedContextMetadata(
-                    cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
-                    starts=chunk_starts.to(device, non_blocking=True),
-                    seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
-                    max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
-                    chunk_seq_lens=chunk_seq_lens,
-                    chunk_seq_lens_npu=chunk_seq_lens.npu(),
-                    workspace=self.chunked_prefill_workspace,
+                    cu_seq_lens=cu_seq_lens_cpu.pin_memory().to(
+                        device, non_blocking=True
+                    ),
+                    starts=chunk_starts.pin_memory().to(
+                        device, non_blocking=True
+                    ),
+                    seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
+                    max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
+                    chunk_seq_lens=chunk_seq_lens,
+                    chunk_seq_lens_npu=chunk_seq_lens.npu(),
+                    workspace=self.chunked_prefill_workspace,
+                )
             )
         prefill_input_positions = input_positions[tokens_start:]
         cos = self.cos_cache[
@@ -626,7 +634,7 @@ def build(
             cos = common_attn_metadata.cos
            sin = common_attn_metadata.sin
            # Notice that num_decodes != num_decode_tokens in SpecDecoding Scenario
-            actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist()
+            actual_seq_lengths_q = query_start_loc_cpu[1:num_decodes + 1].tolist()
            max_seq_lens = seq_lens[:num_decodes].max().item()
            seq_lens = seq_lens[:num_decodes]
            input_positions = input_positions[:num_decode_tokens]
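
Note on the last hunk: actual_seq_lengths_q is now sliced from the host-side query_start_loc_cpu, because calling .tolist() on a device tensor forces a blocking device-to-host read and stalls the async schedule. A small illustrative sketch of the difference, with made-up tensors and a CUDA/CPU fallback standing in for the NPU:

    import torch

    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_decodes = 4

    # Host-side mirror kept alongside the device tensor.
    query_start_loc_cpu = torch.tensor([0, 2, 4, 6, 8], dtype=torch.int32)
    query_start_loc = query_start_loc_cpu.to(dev)

    # Slow path: .tolist() on a device tensor synchronizes the stream.
    # actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist()

    # Fast path: slice the CPU mirror, no device sync needed.
    actual_seq_lengths_q = query_start_loc_cpu[1:num_decodes + 1].tolist()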

vllm_ascend/sample/rejection_sampler.py

Lines changed: 25 additions & 13 deletions

@@ -317,21 +317,27 @@ def rejection_greedy_sample_pytorch(
     draft_token_ids,  # [num_tokens]
     target_argmax,  # [num_tokens]
     bonus_token_ids,  # [batch_size]
-    draft_tokens_per_req,  # [batch_size], list
+    draft_tokens_per_req_cpu,  # [batch_size], list
     max_spec_len,
     is_greedy=None,  # [batch_size] or None
 ):
     batch_size = output_token_ids.size(0)
     num_tokens = draft_token_ids.size(0)
     device = output_token_ids.device
-    draft_tokens_per_req = torch.tensor(draft_tokens_per_req).to(
-        device, non_blocking=True)
+    draft_tokens_per_req = (
+        torch.tensor(draft_tokens_per_req_cpu)
+        .pin_memory()
+        .to(device, non_blocking=True)
+    )
     if is_greedy is None:
         is_greedy = torch.ones(batch_size, dtype=torch.bool, device=device)

     start_indices = cu_num_draft_tokens - draft_tokens_per_req
     req_ids = torch.arange(batch_size, device=device)
-    token_req_ids = torch.repeat_interleave(req_ids, draft_tokens_per_req)
+    total_draft_tokens = sum(draft_tokens_per_req_cpu)
+    token_req_ids = torch.repeat_interleave(
+        req_ids, draft_tokens_per_req, output_size=total_draft_tokens
+    )
     token_positions = torch.arange(
         num_tokens, device=device) - start_indices[token_req_ids]

@@ -357,8 +363,11 @@ def rejection_greedy_sample_pytorch(
                                      max_spec_len * 2)
     first_mismatch_pos_per_req, _ = torch.min(mismatch_positions, dim=1)
     no_mismatch_mask = (first_mismatch_pos_per_req == max_spec_len * 2)
-    first_mismatch_pos_per_req[no_mismatch_mask] = draft_tokens_per_req[
-        no_mismatch_mask]
+    first_mismatch_pos_per_req = torch.where(
+        no_mismatch_mask,
+        draft_tokens_per_req,
+        first_mismatch_pos_per_req,
+    )

     # Copy matched target tokens into output.
     copy_len = torch.minimum(first_mismatch_pos_per_req + 1,
@@ -369,16 +378,19 @@ def rejection_greedy_sample_pytorch(
     greedy_mask = is_greedy.unsqueeze(1)
     final_copy_mask = copy_mask & greedy_mask
     global_idx = start_indices.unsqueeze(1) + copy_indices
-    output_token_ids[final_copy_mask] = target_argmax[
-        global_idx[final_copy_mask]].to(output_token_ids.dtype)
+    output_token_ids_ = torch.where(
+        final_copy_mask,
+        target_argmax[global_idx].to(output_token_ids.dtype),
+        output_token_ids
+    )
+    output_token_ids.copy_(output_token_ids_)
     # Fill bonus token.
     needs_bonus = is_greedy & (first_mismatch_pos_per_req
                                >= draft_tokens_per_req)
-    if torch.any(needs_bonus):
-        bonus_rows = torch.where(needs_bonus)[0]
-        bonus_cols = draft_tokens_per_req[bonus_rows]
-        bonus_token_ids = bonus_token_ids.squeeze(1)
-        output_token_ids[bonus_rows, bonus_cols] = bonus_token_ids[bonus_rows]
+    bonus_rows = torch.where(needs_bonus)[0]
+    bonus_cols = draft_tokens_per_req[bonus_rows]
+    bonus_token_ids = bonus_token_ids.squeeze(1)
+    output_token_ids[bonus_rows, bonus_cols] = bonus_token_ids[bonus_rows]


 def rejection_random_sample_pytorch(
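
Note on this file: the rewritten greedy path drops the data-dependent operations that force a host round-trip. Boolean-mask assignment and repeat_interleave without output_size both need result sizes that only the device knows, which triggers a blocking sync; torch.where and a host-computed output_size keep every shape static. A toy sketch of the same trick, with invented counts rather than vLLM data:

    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    counts_cpu = [2, 3, 1]                        # known on the host
    counts = torch.tensor(counts_cpu, device=device)
    req_ids = torch.arange(len(counts_cpu), device=device)

    # output_size is computed on the host, so repeat_interleave never has to
    # read the device tensor back just to size its output.
    token_req_ids = torch.repeat_interleave(
        req_ids, counts, output_size=sum(counts_cpu))

    # torch.where keeps a fixed output shape, unlike mask-indexed assignment
    # (x[mask] = y[mask]), which runs nonzero() and synchronizes the device.
    x = torch.zeros(3, device=device)
    mask = counts > 1
    x = torch.where(mask, counts.float(), x)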

vllm_ascend/spec_decode/mtp_proposer.py

Lines changed: 29 additions & 2 deletions

@@ -144,6 +144,9 @@ def __init__(
         self.arange = torch.arange(max_num_slots_for_arange,
                                    device=device,
                                    dtype=torch.int32)
+        self.arange_cpu = torch.arange(
+            max_num_slots_for_arange, device="cpu", dtype=torch.int32
+        )

         self.inputs_embeds = torch.zeros(
             (self.max_num_tokens, self.hidden_size),
@@ -159,6 +162,7 @@ def __init__(
         )
         self.use_sparse = hasattr(vllm_config.model_config.hf_config,
                                   "index_topk")
+        self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling

     def load_model(self, model) -> None:
         loader = get_model_loader(self.vllm_config.load_config)
@@ -342,6 +346,7 @@ def generate_token_ids(self,
             self.runner.discard_request_indices.gpu,
             self.runner.num_discarded_requests
         )
+        self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count)

         req_scheduled_tokens = scheduler_output.num_scheduled_tokens
         if self.pcp_size > 1:
@@ -421,6 +426,24 @@ def generate_token_ids(self,
         )

         return draft_token_ids
+
+    def _copy_valid_sampled_token_count(
+        self, next_token_ids: torch.Tensor, valid_sampled_tokens_count: torch.Tensor
+    ) -> None:
+        if self.runner.valid_sampled_token_count_event is not None:
+            default_stream = torch.npu.current_stream()
+            # initialize a new stream to overlap the copy operation with
+            # prepare_input of draft model.
+            with torch.npu.stream(self.runner.valid_sampled_token_count_copy_stream):
+                self.runner.valid_sampled_token_count_copy_stream.wait_stream(
+                    default_stream
+                )  # type: ignore
+                self.runner.valid_sampled_token_count_cpu[
+                    : valid_sampled_tokens_count.shape[0]
+                ].copy_(valid_sampled_tokens_count, non_blocking=True)
+                self.runner.valid_sampled_token_count_event.record()
+
+        self.runner.input_batch.prev_sampled_token_ids = next_token_ids.unsqueeze(1)

     def _init_mtp_model(self):
         architecture = self.vllm_config.model_config.architecture
@@ -689,7 +712,11 @@ def _propose(
             uniform_decode=False)
         aclgraph_runtime_mode, batch_descriptor = \
             self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)
-
+        if self.use_async_scheduling:
+            # there is synchronize between mtp steps when enable aclgraph,
+            # disable aclgraph when use async scheduling to avoid the
+            # synchronize overhead.
+            aclgraph_runtime_mode = CUDAGraphMode.NONE
         if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
         ) and aclgraph_runtime_mode == CUDAGraphMode.FULL:
             graph_pad_size = num_input_tokens
@@ -795,7 +822,7 @@ def _propose(
         # When disable_padded_drafter_batch=False, it should not to be updating these params, maybe.
         if self.speculative_config.disable_padded_drafter_batch or \
             aclgraph_runtime_mode != CUDAGraphMode.FULL:
-            attn_metadata_i.decode.actual_seq_lengths_q = attn_metadata_i.query_start_loc[
+            attn_metadata_i.decode.actual_seq_lengths_q = self.arange_cpu[
                 1:batch_size + 1].tolist()
         if aclgraph_runtime_mode == CUDAGraphMode.FULL:
             attn_metadata_i.decode.actual_seq_lengths_q = \
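
Note on _copy_valid_sampled_token_count: the small device-to-host copy is pushed onto a dedicated stream so it overlaps with the drafter's input preparation, and an event is recorded so the consumer only waits when it actually reads the host buffer. A generic sketch of that stream/event pattern, written against the torch.cuda API for illustration (on Ascend the torch.npu equivalents in the diff apply; the buffer, stream, and event names below are hypothetical):

    import torch

    def overlap_d2h_copy(src: torch.Tensor,
                         dst_pinned_cpu: torch.Tensor,
                         copy_stream: torch.cuda.Stream,
                         done_event: torch.cuda.Event) -> None:
        """Copy `src` (device) into `dst_pinned_cpu` on a side stream."""
        default_stream = torch.cuda.current_stream()
        with torch.cuda.stream(copy_stream):
            # Make the side stream wait for the producer's writes to `src`.
            copy_stream.wait_stream(default_stream)
            dst_pinned_cpu[: src.shape[0]].copy_(src, non_blocking=True)
            # Record completion so the consumer can synchronize only when it
            # actually needs the host value.
            done_event.record()

    if __name__ == "__main__" and torch.cuda.is_available():
        copy_stream = torch.cuda.Stream()
        done_event = torch.cuda.Event()
        counts = torch.randint(0, 8, (16,), device="cuda")
        counts_cpu = torch.empty(32, dtype=counts.dtype, pin_memory=True)
        overlap_d2h_copy(counts, counts_cpu, copy_stream, done_event)
        done_event.synchronize()      # wait only when the value is needed
        print(counts_cpu[:16])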
