
Commit cdbbe0a

Bug Fix: Optimised PCA-TopK modify_llama code
1 parent: df8b247

File tree: 3 files changed, +9 −7 lines

methods/pca_topk/cache_utils.py

Lines changed: 2 additions & 2 deletions
@@ -178,7 +178,7 @@ def micro_benchmark_pca_topk(cache, prompt_keys, top_r, top_k, num_gen_steps=200
         #.squeeze(0).squeeze(-1),
         chunk=256
         #chunk=min(k2, 65536 // Q.shape[-1]),
-    )
+    ) / math.sqrt(head_dim)
     attn_weights = torch.softmax(attn_weights, dim=-1)
 
     attn_output = G.gather_inner_matrix_only_bmv(
@@ -276,6 +276,6 @@ def benchmark_attention(batch_size=1,
 if __name__ == "__main__":
     #test_pcatopk_cache()
     with torch.no_grad():
-        benchmark_attention(prompt_length=512, num_gen_steps=16, batch_size=128, topk=128)
+        benchmark_attention(prompt_length=4096, num_gen_steps=2000, batch_size=16, topk=1024)
 
 
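
The first hunk restores the standard 1/sqrt(head_dim) scaling of the gathered attention scores before the softmax; the second simply switches the benchmark to a longer prompt and more generation steps. A minimal sketch of the scaling being restored, using plain torch in place of the repo's gather kernels (e.g. G.gather_inner_matrix_only_bmv); the shapes below are illustrative assumptions, not values from the benchmark:

import math
import torch

# Illustrative shapes (assumptions): 2 sequences, 4 heads, one query token, 128 gathered keys.
batch, heads, head_dim, top_k = 2, 4, 64, 128
Q = torch.randn(batch, heads, 1, head_dim)
K_topk = torch.randn(batch, heads, top_k, head_dim)

# The fix: divide the scores by sqrt(head_dim) before the softmax,
# i.e. the "/ math.sqrt(head_dim)" term added after the gathered matmul.
attn_weights = torch.matmul(Q, K_topk.transpose(-1, -2)) / math.sqrt(head_dim)
attn_weights = torch.softmax(attn_weights, dim=-1)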

methods/pca_topk/modify_llama_optimized.py

Lines changed: 5 additions & 4 deletions
@@ -76,9 +76,9 @@ def modified_forward(
 
     # TODO: Keep it fixed or make it dynamic?
     if args.top_k <= 1:
-        args.top_k = int(args.top_k * key_states.shape[-2])
+        self.top_k = int(args.top_k * key_states.shape[-2])
     else:
-        args.top_k = int(args.top_k)
+        self.top_k = int(args.top_k)
 
     key_states = torch.matmul(key_states, self.pca_components)
     query_states = torch.matmul(query_states, self.pca_components)
@@ -96,7 +96,7 @@ def modified_forward(
     # We do not need a causal mask here since this is the generation step
     attn_weights = torch.matmul(query_states[:,:,:,:args.top_r], key_states.transpose(2, 3)[:,:,:args.top_r,:]) / math.sqrt(self.head_dim)
 
-    key_states_topk_indices = torch.topk(attn_weights, args.top_k, dim=-1).indices.to("cuda")
+    key_states_topk_indices = torch.topk(attn_weights, self.top_k, dim=-1).indices.to("cuda")
     key_states_topk_indices , _ = torch.sort(key_states_topk_indices, dim=-1)
     key_states_topk_indices = key_states_topk_indices.reshape(-1, key_states_topk_indices.shape[-1])
 
@@ -109,7 +109,8 @@ def modified_forward(
         key_states_topk_indices,
         chunk=256 # Varying this changes performance
         #chunk=min(k2, 65536 // Q.shape[-1]),
-    )
+    ) / math.sqrt(self.head_dim)
+
     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
     attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
 
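
The main fix here stores the resolved top-k count on the attention module (self.top_k) instead of writing it back into args.top_k, and applies the same sqrt(head_dim) scaling to the gathered scores. Since the args object is presumably shared across layers and generation steps, overwriting a fractional top_k (<= 1) with an absolute count on the first call would make every later call reinterpret that count. A hedged sketch of the resolution logic in isolation; the helper name is made up for illustration and is not part of the repo:

def resolve_top_k(top_k_arg: float, num_keys: int) -> int:
    """Interpret top_k_arg as a fraction of the key length if <= 1, else as a count.

    Illustrative helper (assumption): the commit inlines this logic in
    modified_forward and stores the result on self.top_k, leaving args.top_k intact.
    """
    if top_k_arg <= 1:
        return int(top_k_arg * num_keys)
    return int(top_k_arg)

assert resolve_top_k(0.25, 4096) == 1024   # fractional spec -> absolute count
assert resolve_top_k(128, 4096) == 128     # absolute spec passes through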

methods/pca_topk/utils.py

Lines changed: 2 additions & 1 deletion
@@ -104,7 +104,8 @@ def mask_attn_pca_topk(args, layer_idx, attn_weights, attention_mask, query_stat
 
     # Compute attention with the query_states and key_states_sparse
     attn_weights_s_hat = torch.matmul(query_states_sparse, key_states_sparse.transpose(-1, -2)) / math.sqrt(head_dim)
-    methods.LOGGER.update_config({"scaling_factor": "fixed"})
+    if methods.LOGGER is not None:
+        methods.LOGGER.update_config({"scaling_factor": "fixed"})
     if attention_mask is not None: # no matter the length, we just slice it
         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
         attn_weights_s_hat = attn_weights_s_hat + causal_mask
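
methods.LOGGER is presumably left as None when no experiment logger is configured, so the update_config call is now guarded. A small sketch of the pattern; the no-op logger shown alongside is an illustrative alternative, not something the repo does:

class NoOpLogger:
    """Illustrative alternative (assumption): a do-nothing logger removes the need for None checks."""
    def update_config(self, cfg: dict) -> None:
        pass

LOGGER = None  # stands in for methods.LOGGER when logging is disabled

# The guarded call, mirroring the change in mask_attn_pca_topk:
if LOGGER is not None:
    LOGGER.update_config({"scaling_factor": "fixed"})

# With a no-op logger the call site could stay unconditional:
LOGGER = NoOpLogger()
LOGGER.update_config({"scaling_factor": "fixed"})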
