Recompute token counts from the accumulated text so sampled lengths match the scenario more accurately

TJ5 · TJ5 · commit b59fa7fd7d68 · 2025-07-09T17:33:04.000-07:00
diff --git a/genai_bench/sampling/text.py b/genai_bench/sampling/text.py
@@ -187,7 +187,7 @@ def _sample_prefix(self, current_prefix_length) -> str:
                     prefix += line[: int(remaining_prefix_len * char_to_token_ratio)]
                     return prefix
                 prefix += line
-                prefix_tokens_len += line_tokens_len
+                prefix_tokens_len = self.get_token_length(prefix)
 
         return prefix
 
@@ -269,7 +269,7 @@ def _sample_text(self, num_input_tokens: int) -> str:
                     prompt += line[: int(left_tokens_to_sample * char_to_token_ratio)]
                     return prompt
                 prompt += line
-                left_tokens_to_sample -= tokens
+                left_tokens_to_sample = num_input_tokens - self.get_token_length(prompt)
         return prompt
 
     def _sample_prompt(self) -> str:
diff --git a/tests/sampling/test_text.py b/tests/sampling/test_text.py
@@ -60,7 +60,12 @@ def test_check_discrepancy_no_warning(self):
             logger.removeHandler(ch)
 
     def test_sample_chat_request(self):
-        self.tokenizer.encode.return_value = [1, 2, 3, 4, 5]
+        def mock_get_token_length(text, add_special_tokens=False):
+            return len(text) // 4  # Simple approximation: 4 chars per token
+
+        # Override the get_token_length method with our mock
+        self.sampler.get_token_length = mock_get_token_length
+
         scenario = NormalDistribution(
             mean_input_tokens=10,
             stddev_input_tokens=2,
@@ -84,7 +89,13 @@ def test_sample_chat_request_with_dataset(self):
             data=self.test_data,
             use_scenario=False,
         )
-        self.tokenizer.encode.return_value = [1, 2, 3, 4, 5]
+
+        def mock_get_token_length(text, add_special_tokens=False):
+            return len(text) // 4  # Simple approximation: 4 chars per token
+
+        # Override the get_token_length method with our mock
+        no_scenario_sampler.get_token_length = mock_get_token_length
+
         scenario = NormalDistribution(
             mean_input_tokens=10,
             stddev_input_tokens=2,
@@ -102,13 +113,18 @@ def test_sample_chat_request_with_dataset(self):
         )  # Should be None for non-scenario sampling
 
     def test_sample_embedding_request(self):
-        self.tokenizer.encode.return_value = [1, 2, 3, 4, 5]
+        def mock_get_token_length(text, add_special_tokens=False):
+            return len(text) // 4  # Simple approximation: 4 chars per token
+
         embedding_sampler = TextSampler(
             tokenizer=self.tokenizer,
             model=self.model,
             output_modality="embeddings",
             data=self.test_data,
         )
+        # Override the get_token_length method with our mock
+        embedding_sampler.get_token_length = mock_get_token_length
+
         scenario = EmbeddingScenario(tokens_per_document=1024)
 
         request = embedding_sampler.sample(scenario)
@@ -119,13 +135,20 @@ def test_sample_embedding_request(self):
         self.assertTrue(len(request.documents) > 0)
 
     def test_sample_rerank_request(self):
-        self.tokenizer.encode.return_value = [1, 2, 3, 4, 5]
+        # Mock get_token_length to return different values based on input length
+        # This prevents infinite loops in _sample_text()
+        def mock_get_token_length(text, add_special_tokens=False):
+            return len(text) // 2  # Simple approximation: 2 chars per token
+
         rerank_sampler = TextSampler(
             tokenizer=self.tokenizer,
             model=self.model,
             output_modality="rerank",
             data=self.test_data,
         )
+        # Override the get_token_length method with our mock
+        rerank_sampler.get_token_length = mock_get_token_length
+
         scenario = ReRankScenario(tokens_per_document=1024, tokens_per_query=100)
 
         request = rerank_sampler.sample(scenario)