gmm tuning v3

Prayer3th · Prayer3th · commit d098bf3674a2 · 2025-09-18T14:59:52.000+08:00
diff --git a/python/sgl_jax/srt/model_executor/model_runner.py b/python/sgl_jax/srt/model_executor/model_runner.py
@@ -379,9 +379,6 @@ def _forward(
         forward_batch: ForwardBatch,
         logits_metadata: LogitsMetadata,
     ):
-        # 预先计算 GMM tiling 参数并设置到 forward_batch 中
-        self._compute_and_set_gmm_tiling(forward_batch)
-
         cache_miss_count = 0
         import jax._src.test_util as jtu
 
@@ -394,67 +391,11 @@ def _forward(
 
         return output, cache_miss_count
 
-    def _compute_and_set_gmm_tiling(self, forward_batch: ForwardBatch):
-        """预先计算 GMM tiling 参数并设置到 forward_batch"""
-        try:
-            # 获取模型配置
-            hidden_size = getattr(self.model_config, "hidden_size", 2048)
-            intermediate_size = getattr(self.model_config, "moe_intermediate_size", 768)
-            num_experts = getattr(self.model_config, "num_experts", 128)
-            num_experts_per_tok = getattr(self.model_config, "num_experts_per_tok", 8)
-
-            # 计算 tiling 参数
-            static_tiling_gate, static_tiling_down = self.compute_gmm_tiling_for_batch(
-                forward_batch,
-                hidden_size,
-                intermediate_size,
-                num_experts,
-                num_experts_per_tok,
-            )
-
-            # 设置到 forward_batch
-            forward_batch.static_tiling_gate = static_tiling_gate
-            forward_batch.static_tiling_down = static_tiling_down
-
-        except Exception as e:
-            # 出现任何错误时使用默认值
-            forward_batch.static_tiling_gate = (512, 1024, 1024)
-            forward_batch.static_tiling_down = (512, 1024, 1024)
-
     def _set_kv_cache_after_forward(self, layers_kv_fused, forward_batch: ForwardBatch):
         start_idx = forward_batch.token_to_kv_pool.start_layer
         end_idx = start_idx + len(layers_kv_fused)
         forward_batch.token_to_kv_pool.kv_buffer[start_idx:end_idx] = layers_kv_fused
 
-    def compute_gmm_tiling_for_batch(
-        self,
-        forward_batch: ForwardBatch,
-        hidden_size: int,
-        intermediate_size: int,
-        num_experts: int,
-        num_experts_per_tok: int,
-    ) -> Tuple[Tuple[int, int, int], Tuple[int, int, int]]:
-        total_tokens = forward_batch.seq_lens.sum()
-
-        # 计算考虑 expert topk 的实际 m 值
-        m_actual = int(total_tokens * num_experts_per_tok)
-
-        # 构造高效的字符串 key
-        gate_key = f"m{m_actual}_k{hidden_size}_n{intermediate_size}_g{num_experts}"
-        down_key = f"m{m_actual}_k{intermediate_size}_n{hidden_size}_g{num_experts}"
-        logger.info(f"gate_key: {gate_key}, down_key: {down_key}")
-        if forward_batch.gmm_tiling_configs:
-            # 只做精确匹配
-            gate_tiling = forward_batch.gmm_tiling_configs.get(gate_key, None)
-            down_tiling = forward_batch.gmm_tiling_configs.get(down_key, None)
-
-            if gate_tiling and down_tiling:
-                return gate_tiling, down_tiling
-        else:
-            logger.warning("No GMM tiling configs found in forward_batch")
-        logger.warning(f"No GMM tiling found for key: {gate_key} or {down_key}")
-        return (512, 1024, 1024), (512, 1024, 1024)
-
     def forward_idle(
         self,
         forward_batch: ForwardBatch,