 from ...batch import Batch, Req
 from lightllm.server.router.req_queue.base_queue import BaseQueue
 from lightllm.common.basemodel.infer_lock import g_router_lock
+from lightllm.utils.envs_utils import get_page_size
+
+
+def cdiv(a, b):
+    return (a + b - 1) // b
 
 
 class ChunkedPrefillQueue(BaseQueue):
@@ -21,8 +26,9 @@ def _init_cache_list(self, current_batch: Batch, is_busy):
         return
 
     # @calculate_time(show=True, min_cost_ms=0.1)
-    def _can_add_new_req(self, req: Req, is_busy, new_batch_first_router_need_tokens):
-        self.cache_len_list.append(req.get_tuple_tokens(is_busy, self.router_max_new_token_len))  # hard to analyze
+    def _can_add_new_req(self, req: Req, is_busy, new_batch_first_router_need_tokens, new_batch_prefill_need_pages):
+        token_infos = req.get_tuple_tokens(is_busy, self.router_max_new_token_len)
+        self.cache_len_list.append(token_infos)  # hard to analyze
         self.cache_len_list.sort(key=lambda x: -x[1])
 
         left_out_len_array = np.array([e[1] for e in self.cache_len_list])
@@ -42,16 +48,29 @@ def _can_add_new_req(self, req: Req, is_busy, new_batch_first_router_need_tokens
         new_batch_first_router_need_tokens += req.get_first_router_need_tokens()
         ok_prefill = new_batch_first_router_need_tokens <= self.batch_max_tokens
 
-        if ok_token_num and ok_req_num and ok_prefill:
+        # Check that enough KV-cache pages are available for this request.
+        ok_page_num = True
+        if "page_size_variable" in self.router.mode:
+            available_pages = self.router.read_only_statics_mem_manager.get_unrefed_page_num(self.dp_index)
+            page_size = get_page_size()
+            if self.router.radix_cache_client is not None:
+                radix_cache = self.router.radix_cache_client
+                available_pages += radix_cache.get_unrefed_tokens_num(self.dp_index) // page_size
+
+            new_batch_prefill_need_pages += cdiv(req.input_len + req.shm_cur_output_len, page_size)
+            decode_need_pages = cdiv((left_out_len_array * size_array).max(), page_size)
+            ok_page_num = new_batch_prefill_need_pages + decode_need_pages < available_pages
+
+        if ok_token_num and ok_req_num and ok_prefill and ok_page_num:
             self.router.shared_token_load.set_estimated_peak_token_count(need_max_token_num, self.dp_index)
             self.router.shared_token_load.set_dynamic_max_load(
                 (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index))
                 / self.max_total_tokens,
                 self.dp_index,
             )
-            return True, new_batch_first_router_need_tokens
+            return True, new_batch_first_router_need_tokens, new_batch_prefill_need_pages
         else:
-            return False, new_batch_first_router_need_tokens
+            return False, new_batch_first_router_need_tokens, new_batch_prefill_need_pages
 
     # @calculate_time(show=True, min_cost_ms=10)
     def generate_new_batch(self, current_batch: Batch):
@@ -77,15 +96,16 @@ def generate_new_batch(self, current_batch: Batch):
 
         waiting_queue = self.waiting_req_list
 
+        new_batch_prefill_need_pages = cdiv(new_batch_first_router_need_tokens, get_page_size())
         for req in waiting_queue:
             if req.is_aborted:
                 # Due to bookkeeping complexity, only requests that were never scheduled to run can be dropped from the queue directly on abort.
                 # Paused requests must be resumed and then filtered by the router manager; keep this handling for now, otherwise managed tokens would leak.
                 aborted_count += 1
                 abort_req_list.append(req)
                 continue
-            ok_insert, new_batch_first_router_need_tokens = self._can_add_new_req(
-                req, is_busy, new_batch_first_router_need_tokens
+            ok_insert, new_batch_first_router_need_tokens, new_batch_prefill_need_pages = self._can_add_new_req(
+                req, is_busy, new_batch_first_router_need_tokens, new_batch_prefill_need_pages
             )
            if ok_insert:
                can_run_list.append(req)
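
To make the page accounting explicit: the new `ok_page_num` branch admits a request only if the pages needed to prefill the queued prompts plus the pages needed at the estimated decode peak fit within the currently free pages. Below is a minimal standalone sketch of that check, not part of the patch; the function and parameter names are illustrative, not lightllm APIs.

```python
# Standalone sketch of the page-based admission math, assuming fixed-size
# KV-cache pages; names are illustrative and not taken from lightllm.
def cdiv(a: int, b: int) -> int:
    # Ceiling division: number of whole pages needed to hold `a` tokens.
    return (a + b - 1) // b


def batch_fits_in_pages(prompt_token_counts, decode_peak_tokens, available_pages, page_size):
    # Pages needed to prefill every queued request: each request's token
    # count is rounded up to whole pages, mirroring the per-request cdiv().
    prefill_pages = sum(cdiv(n, page_size) for n in prompt_token_counts)
    # Pages needed at the estimated decode-time peak, also rounded up.
    decode_pages = cdiv(decode_peak_tokens, page_size)
    return prefill_pages + decode_pages < available_pages


# Two prompts of 300 and 500 tokens, a decode peak of 4096 tokens, and 80
# free 64-token pages: 5 + 8 + 64 = 77 < 80, so the batch would be admitted.
print(batch_fits_in_pages([300, 500], 4096, available_pages=80, page_size=64))
```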