
Commit d48d084

Optimizations (#2)
* Initial commit
* Reformat code
* Fix bug
* Add Gumbel-Max trick based random sampling
* Bring up to date
* Use Gumbel-Max Trick based Random Sampling as default
* Clip k to max value
* Add docstring for sampling parameters
* Fix bug
* Add support for continuous batching
* Fix ONNX error for batch_size 1 treated as a Constant
* Undo docstring deletion
* Remove device and unnecessary reshapes
* Revert batch_size to 1
* Remove vocab_size from dynamic axes
* Change condition
* Change size of each sampling parameter to (batch_size, 1)
* Reformat code
* Add optimizations
* Identify optimizations
* Fix bug
* Fix merge issue
* Optimizations: perform random sampling only on topk_values_asc; only need logits for probs when self.return_pdfs is True
* Remove where clause for temperature
* Remove boolean type casting for retain state
* Always return next_tokens
* Fix bug
* Reformat code
* Initialize retain states
* Optimize imports
* Remove torch.index_select()
* Change dtype of penalty buffers to bool

---------

Signed-off-by: quic-sanising <[email protected]>
1 parent 7dfdda4 commit d48d084
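
The headline change is making Gumbel-Max based random sampling the default. As a reminder of the trick itself, here is a minimal standalone sketch (not code from this repository; the commit applies the same idea inside sampler_forward, over the top-k slice only):

import torch

# Gumbel-Max trick: argmax(log p_i + g_i) with g_i ~ Gumbel(0, 1) selects index i
# with probability p_i, so categorical sampling reduces to a single argmax.
def gumbel_max_sample(logits: torch.Tensor) -> torch.Tensor:
    u = torch.rand_like(logits)                # u ~ Uniform(0, 1)
    gumbel_noise = -torch.log(-torch.log(u))   # g ~ Gumbel(0, 1)
    return torch.argmax(logits + gumbel_noise, dim=-1)

# Example: draws follow softmax(logits) without calling torch.multinomial.
print(gumbel_max_sample(torch.tensor([[2.0, 0.5, 0.1]])))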

3 files changed: +53 additions, −37 deletions


QEfficient/transformers/models/modeling_auto.py

Lines changed: 3 additions & 3 deletions
@@ -38,8 +38,8 @@
     CustomOpsTransform,
     KVCacheModuleMethodMapperTransform,
     KVCacheTransform,
-    SpDTransform,
     SamplerTransform,
+    SpDTransform,
     VlmKVOffloadTransform,
     VlmNoKVOffloadTransform,
 )
@@ -1483,7 +1483,7 @@ def export(self, export_dir: Optional[str] = None) -> str:
         dynamic_axes["last_accepted_output_tokens"] = {0: "batch_size", 1: "num_logits_to_keep"}
 
         example_inputs["past_repetition_penalty_buffer"] = torch.zeros(
-            fbs if self.continuous_batching else bs, self.model.config.vocab_size, dtype=torch.int32)
+            fbs if self.continuous_batching else bs, self.model.config.vocab_size, dtype=torch.bool)
         dynamic_axes["past_repetition_penalty_buffer"] = {
             0: "full_batch_size" if self.continuous_batching else "batch_size",
         }
@@ -1493,7 +1493,7 @@ def export(self, export_dir: Optional[str] = None) -> str:
         dynamic_axes["repetition_penalties"] = {0: "batch_size"}
 
         example_inputs["past_presence_penalty_buffer"] = torch.zeros(
-            fbs if self.continuous_batching else bs, self.model.config.vocab_size, dtype=torch.int32)
+            fbs if self.continuous_batching else bs, self.model.config.vocab_size, dtype=torch.bool)
         dynamic_axes["past_presence_penalty_buffer"] = {
             0: "full_batch_size" if self.continuous_batching else "batch_size",
         }
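
The only functional change in this file is the dtype of the export-time penalty buffers, from torch.int32 to torch.bool. These buffers are per-sequence, vocab-sized masks, so the narrower dtype cuts their size by 4x. A rough illustration (the sizes below are assumptions for the example, not values taken from this repository):

import torch

# Hypothetical sizes for illustration only; the real shapes come from the model
# config and the batch settings used at export time.
vocab_size, full_batch_size = 32_000, 8

as_int32 = torch.zeros(full_batch_size, vocab_size, dtype=torch.int32)
as_bool = torch.zeros(full_batch_size, vocab_size, dtype=torch.bool)

print(as_int32.nelement() * as_int32.element_size())  # 1,024,000 bytes
print(as_bool.nelement() * as_bool.element_size())    #   256,000 bytes (4x smaller)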

QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 1 addition & 1 deletion
@@ -246,8 +246,8 @@
     QEffWhisperModel,
     QEffWhisperPositionalEmbedding,
 )
-from QEfficient.transformers.spd.causal_lm_forward import tlm_forward
 from QEfficient.transformers.sampler.sampler import sampler_forward
+from QEfficient.transformers.spd.causal_lm_forward import tlm_forward
 
 class CustomOpsTransform(ModuleMappingTransform):
     _module_mapping = {

QEfficient/transformers/sampler/sampler.py

Lines changed: 49 additions & 33 deletions
@@ -5,7 +5,7 @@
 # from QEfficient.customop import CtxScatterFunc
 from QEfficient.utils.constants import Constants
 from transformers.cache_utils import Cache
-from transformers.modeling_outputs import ModelOutput, CausalLMOutputWithPast
+from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
 from typing import List, Optional, Tuple, Union
 
 
@@ -217,16 +217,19 @@ def sampler_forward(
 
     # Select relevant rows
     batch_index_reshaped = batch_index.view(-1)
-    past_repetition_penalty_buffer_selected = torch.index_select(past_repetition_penalty_buffer, 0, batch_index_reshaped)
-    past_presence_penalty_buffer_selected = torch.index_select(past_presence_penalty_buffer, 0, batch_index_reshaped)
+    past_repetition_penalty_buffer_selected = past_repetition_penalty_buffer[batch_index_reshaped]
+    past_presence_penalty_buffer_selected = past_presence_penalty_buffer[batch_index_reshaped]
 
     logits = logits.reshape(-1, vocab_size)  # Reshape tensor to 2D
 
     if input_ids.shape[1] > spec_length:  # Prefill phase, initialize retained states
         # TODO: Replace scatter_ with CtxScatterFunc; Replace -1 with int_max while exporting on onnx
         # past_repetition_penalty_buffer_selected = CtxScatterFunc.apply(past_repetition_penalty_buffer_selected.unsqueeze(1), input_ids, 1).squeeze(1)
+        if position_ids[0, 0] == 0:
+            past_repetition_penalty_buffer_selected = torch.mul(past_repetition_penalty_buffer_selected, 0)
+            past_presence_penalty_buffer_selected = torch.mul(past_presence_penalty_buffer_selected, 0)
         past_repetition_penalty_buffer_selected.scatter_(1, input_ids, 1)
-        past_presence_penalty_buffer_selected.scatter_(1, input_ids, 0)
+
     else:  # Decode phase, update retained states
         past_repetition_penalty_buffer_selected.scatter_(1, last_accepted_output_tokens, 1)
         past_presence_penalty_buffer_selected.scatter_(1, last_accepted_output_tokens, 1)
@@ -236,11 +239,26 @@ def sampler_forward(
     past_repetition_penalty_buffer[batch_index_reshaped] = past_repetition_penalty_buffer_selected
     past_presence_penalty_buffer[batch_index_reshaped] = past_presence_penalty_buffer_selected
 
+    # Greedy Sampling
+    greedy_samples = torch.argmax(logits, dim=1, keepdim=True)  # (batch_size * spec_length, 1)
+    if (temperatures == 0).all() and self.return_pdfs == False:
+        return QEffCausalLMOutputWithPast(
+            loss=None,
+            logits=None,
+            probs=None,
+            next_tokens=greedy_samples.reshape(-1, spec_length, 1),  # Return sampled next tokens instead of logits
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            past_repetition_penalty_buffer=past_repetition_penalty_buffer,
+            past_presence_penalty_buffer=past_presence_penalty_buffer,
+        )
+
     # Repetition Penalty
     if (repetition_penalties != 1.).any():
         repetition_penalties = repetition_penalties.repeat(spec_length, vocab_size)  # (batch_size, 1) -> (batch_size * spec_length, vocab_size)
         past_repetition_penalty_buffer_selected = past_repetition_penalty_buffer_selected.repeat(spec_length, 1)  # (batch_size, vocab_size) -> (batch_size * spec_length, vocab_size)
-        repetition_penalties[~past_repetition_penalty_buffer_selected.bool()] = 1.0
+        repetition_penalties[past_repetition_penalty_buffer_selected == 0] = 1.0
         logits = torch.where(logits > 0, logits / repetition_penalties, logits * repetition_penalties)
 
     # Presence Penalty
@@ -252,12 +270,11 @@ def sampler_forward(
     # TODO: Frequency Penalty
 
     # Temperature Scaling
-    if (temperatures != 0).any():
-        temperatures = temperatures.repeat(spec_length, 1)  # (batch_size, 1) -> (batch_size * spec_length, 1)
-        logits = torch.where(temperatures != 0, logits / temperatures, logits)
-
+    temperatures = temperatures.repeat(spec_length, 1)  # (batch_size, 1) -> (batch_size * spec_length, 1)
+    logits /= temperatures
+
     # Top K
-    # TODO (Optimization): if (top_ks != -1 or top_ks != Constants.MAX_TOP_K_IDS).any(): skip
+    # TODO (Optimization): if (top_ks != -1 or top_ks != Constants.MAX_TOP_K_IDS).any() is False: skip but will need topk_values_asc and topk_indices_asc
     topk_values, topk_indices = torch.topk(logits, k=Constants.MAX_TOP_K_IDS, dim=1)  # (batch_size * spec_length, vocab_size)
     topk_values_asc = topk_values.flip(dims=[1])
     topk_indices_asc = topk_indices.flip(dims=[1])
@@ -267,42 +284,41 @@ def sampler_forward(
     topk_values_asc[topk_mask] = torch.finfo(torch.float16).min
 
     # Top P
-    # TODO (Optimization): if (top_ps != 1.).any(): skip but will need top_probs for Min P
+    # TODO (Optimization): if (top_ps != 1.).any() is False: skip but will need top_probs for Min P
     top_probs = torch.softmax(topk_values_asc, dim=1)  # (batch_size * spec_length, Constants.MAX_TOP_K_IDS)
     topk_probs_sum = torch.cumsum(top_probs, dim=1)
     top_p_mask = topk_probs_sum <= 1 - top_ps.repeat(spec_length, 1)  # (batch_size * spec_length, Constants.MAX_TOP_K_IDS)
     top_p_mask[:, Constants.MAX_TOP_K_IDS - 1] = False
     topk_values_asc[top_p_mask] = torch.finfo(torch.float16).min
 
     # Min P
-    # TODO (Optimization): if (min_ps != 0.).any(): skip
-    scaled_min_p = torch.mul(min_ps.repeat(spec_length, 1), top_probs[:, -1:])  # (batch_size * spec_length, 1)
-    min_p_mask = top_probs < scaled_min_p  # (batch_size * spec_length, Constants.MAX_TOP_K_IDS)
-    topk_values_asc[min_p_mask] = torch.finfo(torch.float16).min
-
-    logits.fill_(torch.finfo(torch.float16).min)
-    logits = logits.scatter(1, topk_indices_asc, topk_values_asc)  # (batch_size * spec_length, vocab_size)
-
-    # Softmax
-    # TODO (Optimization): if (temperatures == 0).all(): skip and perform only greedy sampling
-    probs = torch.softmax(logits, dim=1)  # (batch_size * spec_length, vocab_size)
-
-    # Sample the next tokens
-    # TODO (Optimization): if self.return_pds: skip
-    greedy_samples = torch.argmax(probs, dim=-1, keepdim=True)  # Greedy Sampling
+    if (min_ps != 0.).any():
+        scaled_min_p = torch.mul(min_ps.repeat(spec_length, 1), top_probs[:, Constants.MAX_TOP_K_IDS - 1:])  # (batch_size * spec_length, 1)
+        min_p_mask = top_probs < scaled_min_p  # (batch_size * spec_length, Constants.MAX_TOP_K_IDS)
+        topk_values_asc[min_p_mask] = torch.finfo(torch.float16).min
+
+    probs = None
+    if self.return_pdfs:
+        # Update the logits
+        logits.fill_(torch.finfo(torch.float16).min)
+        logits = logits.scatter(1, topk_indices_asc, topk_values_asc)  # (batch_size * spec_length, vocab_size)
+        # Softmax
+        probs = torch.softmax(logits, dim=1).reshape(-1, spec_length, vocab_size)  # (batch_size, spec_length, vocab_size)
+
+    # Random Sampling
+    topk_probs_asc = torch.softmax(topk_values_asc, dim=1)  # (batch_size * spec_length, Constants.MAX_TOP_K_IDS)
     gumbel_noise = -torch.log(-torch.log(random_numbers.repeat(spec_length, 1)))  # Gumbel-Max Trick
-    y = probs + gumbel_noise
-    random_samples = torch.argmax(y, dim=-1, keepdim=True)  # Random Sampling
-    next_tokens = torch.where(temperatures == 0, greedy_samples, random_samples)  # (batch_size * spec_length, 1)
+    y = topk_probs_asc + gumbel_noise
+    random_samples_indices = torch.argmax(y, dim=1, keepdim=True)
+    random_samples = torch.gather(topk_indices_asc, 1, random_samples_indices)  # (batch_size * spec_length, 1)
 
-    # Reshape tensor back to 3D
-    probs = probs.reshape(-1, spec_length, vocab_size)
-    next_tokens = next_tokens.reshape(-1, spec_length, 1)
+    # Sample the next tokens
+    next_tokens = torch.where(temperatures == 0, greedy_samples, random_samples).reshape(-1, spec_length, 1)  # (batch_size, spec_length, 1)
 
     return QEffCausalLMOutputWithPast(
         loss=None,
         logits=None,
-        probs=probs if self.return_pdfs else None,  # Return probabilities instead of logits
+        probs=probs,
         next_tokens=next_tokens,  # Return sampled next tokens instead of logits
         past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
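
Taken together, the sampler now short-circuits to pure greedy decoding when every temperature is zero and probabilities are not requested, and otherwise filters only the top-k slice (top-p and min-p masks) before drawing tokens with the Gumbel-Max trick. Below is a simplified, single-sequence sketch of that flow; it is illustrative only, mirrors the logic above rather than reproducing it line for line, and the parameter names are just the sampling parameters from the diff applied to one row:

import torch

def sample_next_token(logits, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0):
    # Greedy short-circuit, analogous to the early return added in sampler_forward
    if temperature == 0:
        return int(torch.argmax(logits))
    scaled = logits / temperature

    # Work only on the top-k slice; nothing below needs the full vocab again
    values, indices = torch.topk(scaled, k=top_k)
    probs = torch.softmax(values, dim=-1)

    # Top-p: keep the smallest set of highest-probability tokens whose mass reaches top_p
    sorted_probs, order = probs.sort(descending=True)
    drop = torch.cumsum(sorted_probs, dim=-1) - sorted_probs > top_p
    values[order[drop]] = float("-inf")

    # Min-p: drop tokens whose probability is below min_p times the best probability
    values[probs < min_p * probs.max()] = float("-inf")

    # Gumbel-Max sampling over the surviving candidates
    gumbel_noise = -torch.log(-torch.log(torch.rand_like(values)))
    winner = torch.argmax(values + gumbel_noise)
    return int(indices[winner])

# Example usage with made-up settings
token_id = sample_next_token(torch.randn(32_000), temperature=0.8, top_k=50, top_p=0.9, min_p=0.05)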
