From 4c9aac95bf6453f1db4b1a2a491de14af000261d Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Fri, 5 Dec 2025 20:30:52 +0000
Subject: [PATCH 1/2] fix incorrect shared prefix prompt length

---
 .../datagen/shared_prefix_datagen.py          | 51 ++++++++++++++-----
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/inference_perf/datagen/shared_prefix_datagen.py b/inference_perf/datagen/shared_prefix_datagen.py
index f949a90a..fc9ea2fd 100644
--- a/inference_perf/datagen/shared_prefix_datagen.py
+++ b/inference_perf/datagen/shared_prefix_datagen.py
@@ -86,44 +86,67 @@ def get_data(self) -> Generator[InferenceAPIData, None, None]:
             yield LazyLoadInferenceAPIData(data_index=i, prefered_worker_id=prefered_worker_id)
             i += 1
 
-    def _generate_random_token_ids(self, length: int) -> List[int]:
+    def _generate_random_token_ids(self, length: int, prefix_token_ids: List[int]) -> List[int]:
         """Generates a list of random token IDs of a specified length."""
         if length == 0:
             return []
 
+        hf_tokenizer = self.tokenizer.get_tokenizer()
+        prefix_prompt_len = self.tokenizer.count_tokens(hf_tokenizer.decode(prefix_token_ids, skip_special_tokens=True))
+
+        if prefix_prompt_len > length:
+            raise ValueError(f"Prefix length ({prefix_prompt_len}) exceeds requested length ({length}).")
+
+        random_part_size = length - prefix_prompt_len + 5
+
         # np.random.randint's high parameter is exclusive
-        return np.random.randint(0, self.vocab_size, size=length, dtype=np.int64).tolist()  # type: ignore[no-any-return]
+        token_ids = prefix_token_ids + np.random.randint(0, self.vocab_size, size=random_part_size, dtype=np.int64).tolist()
+        prompt_text = hf_tokenizer.decode(token_ids, skip_special_tokens=True)
+
+        while length < self.tokenizer.count_tokens(prompt_text):
+            token_ids.pop()
+            prompt_text = hf_tokenizer.decode(token_ids, skip_special_tokens=True)
+
+        # if trimmed too many tokens, retry
+        if length > self.tokenizer.count_tokens(prompt_text):
+            token_ids = prefix_token_ids + np.random.randint(0, self.vocab_size, size=random_part_size, dtype=np.int64).tolist()
+            prompt_text = hf_tokenizer.decode(token_ids, skip_special_tokens=True)
+
+        return token_ids
 
     def _generate_prompts(self) -> None:
         """Pre-generates all prompts based on the configuration."""
         if self.tokenizer is None:
-            # This check is defensive; __init__ should have already validated this.
raise ValueError("Tokenizer is not available for generating prompts.") hf_tokenizer = self.tokenizer.get_tokenizer() for group_id in range(self.num_groups): - # Generate a shared prefix (system prompt) - shared_prefix_token_ids = self._generate_random_token_ids(self.system_prompt_len) + shared_prefix_token_ids = self._generate_random_token_ids(self.system_prompt_len, prefix_token_ids=[]) + shared_prefix_text = hf_tokenizer.decode(shared_prefix_token_ids, skip_special_tokens=True) for prompt_id in range(self.num_prompts_per_group): - # Generate a unique question - question_token_ids = self._generate_random_token_ids(self.question_len) - question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True) + total_target_length = self.system_prompt_len + self.question_len + + full_token_ids = self._generate_random_token_ids( + length=total_target_length, + prefix_token_ids=shared_prefix_token_ids + ) if self.enable_multi_turn_chat: - # multi turn chat, create user to keep conversation + + question_token_ids = full_token_ids[len(shared_prefix_token_ids):] + question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True) + self.user_sessions.append( LocalUserSession( user_session_id=f"user_session_{self.num_prompts_per_group * group_id + prompt_id}", context=shared_prefix_text, ) ) + self.prompts.append(question_text) else: - # Single turn chat, Combine shared prefix and question - question_text = shared_prefix_text + " " + question_text - - self.prompts.append(question_text) + full_prompt_text = hf_tokenizer.decode(full_token_ids, skip_special_tokens=True) + self.prompts.append(full_prompt_text) - # Shuffle the generated prompts to ensure randomness if served sequentially by different workers random.shuffle(self.prompts) From 613c9ad51784e995d28c7a5ded711cdd2f842029 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 8 Dec 2025 16:20:03 +0000 Subject: [PATCH 2/2] report prompt length for failed requests --- inference_perf/apis/chat.py | 5 ++--- inference_perf/apis/completion.py | 3 +-- inference_perf/datagen/shared_prefix_datagen.py | 4 +++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/inference_perf/apis/chat.py b/inference_perf/apis/chat.py index 09650981..c081be55 100644 --- a/inference_perf/apis/chat.py +++ b/inference_perf/apis/chat.py @@ -51,6 +51,8 @@ async def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool, s async def process_response(self, response: ClientResponse, config: APIConfig, tokenizer: CustomTokenizer) -> InferenceInfo: if config.streaming: + prompt_text = "".join([msg.content for msg in self.messages if msg.content]) + prompt_len = tokenizer.count_tokens(prompt_text) output_text = "" output_token_times: List[float] = [] buffer = b"" @@ -77,9 +79,6 @@ async def process_response(self, response: ClientResponse, config: APIConfig, to else: continue break - - prompt_text = "".join([msg.content for msg in self.messages if msg.content]) - prompt_len = tokenizer.count_tokens(prompt_text) output_len = tokenizer.count_tokens(output_text) return InferenceInfo( input_tokens=prompt_len, diff --git a/inference_perf/apis/completion.py b/inference_perf/apis/completion.py index 35d3980c..c4069ab2 100644 --- a/inference_perf/apis/completion.py +++ b/inference_perf/apis/completion.py @@ -47,6 +47,7 @@ async def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool, s async def process_response(self, response: ClientResponse, config: APIConfig, tokenizer: CustomTokenizer) -> InferenceInfo: if 
         if config.streaming:
+            prompt_len = tokenizer.count_tokens(self.prompt)
             output_text = ""
             output_token_times: List[float] = []
             buffer = b""
@@ -70,8 +71,6 @@ async def process_response(self, response: ClientResponse, config: APIConfig, to
                     else:
                         continue
                 break
-
-            prompt_len = tokenizer.count_tokens(self.prompt)
             output_len = tokenizer.count_tokens(output_text)
             self.model_response = output_text
             return InferenceInfo(
diff --git a/inference_perf/datagen/shared_prefix_datagen.py b/inference_perf/datagen/shared_prefix_datagen.py
index fc9ea2fd..80470da9 100644
--- a/inference_perf/datagen/shared_prefix_datagen.py
+++ b/inference_perf/datagen/shared_prefix_datagen.py
@@ -116,6 +116,7 @@ def _generate_random_token_ids(self, length: int, prefix_token_ids: List[int]) -
     def _generate_prompts(self) -> None:
         """Pre-generates all prompts based on the configuration."""
         if self.tokenizer is None:
+            # This check is defensive; __init__ should have already validated this.
             raise ValueError("Tokenizer is not available for generating prompts.")
 
         hf_tokenizer = self.tokenizer.get_tokenizer()
@@ -134,7 +135,7 @@ def _generate_prompts(self) -> None:
                 )
 
                 if self.enable_multi_turn_chat:
-
+                    # multi turn chat, create user to keep conversation
                     question_token_ids = full_token_ids[len(shared_prefix_token_ids):]
                     question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True)
 
@@ -146,6 +147,7 @@ def _generate_prompts(self) -> None:
                     )
                     self.prompts.append(question_text)
                 else:
+                    # Single turn chat, Combine shared prefix and question
                     full_prompt_text = hf_tokenizer.decode(full_token_ids, skip_special_tokens=True)
                     self.prompts.append(full_prompt_text)
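
Note on the approach in PATCH 1/2: random token ids generally do not round-trip through decode and re-tokenize at the same length, so the patch over-generates by a few ids and then trims until the decoded prompt re-tokenizes to the requested length (with a single retry if it overshoots). Below is a minimal standalone sketch of that trimming idea, not part of the patch itself; it assumes a Hugging Face tokenizer, and the helper name random_ids_for_length, the slack parameter, and the gpt2 tokenizer choice are illustrative rather than taken from the repository.

    # Standalone sketch: over-generate random ids, then trim to the target re-tokenized length.
    from typing import List

    import numpy as np
    from transformers import AutoTokenizer


    def random_ids_for_length(tokenizer, prefix_ids: List[int], target_len: int, slack: int = 5) -> List[int]:
        def count(ids: List[int]) -> int:
            # Token count of the decoded text, mirroring count_tokens(decode(ids)) in the patch.
            text = tokenizer.decode(ids, skip_special_tokens=True)
            return len(tokenizer.encode(text, add_special_tokens=False))

        prefix_len = count(prefix_ids)
        if prefix_len > target_len:
            raise ValueError(f"Prefix length ({prefix_len}) exceeds requested length ({target_len}).")

        # Decoding random ids and re-tokenizing usually merges some of them, so the
        # re-tokenized count lands below the raw id count; over-generate by `slack`.
        random_part = np.random.randint(0, tokenizer.vocab_size, size=target_len - prefix_len + slack, dtype=np.int64)
        ids = list(prefix_ids) + random_part.tolist()

        # Trim from the end until the decoded prompt no longer exceeds the target length.
        while count(ids) > target_len:
            ids.pop()
        return ids


    if __name__ == "__main__":
        tok = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer choice
        prefix = tok.encode("You are a helpful assistant.", add_special_tokens=False)
        ids = random_ids_for_length(tok, prefix, target_len=64)
        print(len(tok.encode(tok.decode(ids, skip_special_tokens=True), add_special_tokens=False)))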