@@ -739,6 +739,9 @@ def get_hccl_config_for_pg_options(group_name: str) -> Optional[dict]:
739739 "dp" : {
740740 "hccl_buffer_size" : calculate_dp_buffer_size ()
741741 },
742+ "ep" : {
743+ "hccl_buffer_size" : calculate_ep_buffer_size ()
744+ },
742745 }
743746 return hccl_config_map .get (group_name , get_default_buffer_config ())
744747
@@ -760,6 +763,25 @@ def calculate_dp_buffer_size() -> int:
760763 return max (dp_buffer_size , _MIN_DP_BUFFER_SIZE )
761764
762765
def calculate_ep_buffer_size() -> int:
    """Return the HCCL buffer size (in MB) for the expert-parallel ("ep") group.

    Sizing formula (bytes): batch_size * hidden_size * topk * 4,
    where the 4 bytes per element come from two int8 buffers plus one
    bf16 buffer (int8_size * 2 + bf16_size = 1*2 + 2 = 4). The result is
    rounded up to whole MB and floored at ``_DEFAULT_BUFFER_SIZE``.

    Returns:
        Buffer size in MB, at least ``_DEFAULT_BUFFER_SIZE``.
    """
    # Function-scope import: vllm config is only available at runtime and a
    # top-level import could create a circular dependency.
    from vllm.config import get_current_vllm_config
    vllm_config = get_current_vllm_config()
    hf_config = vllm_config.model_config.hf_config

    hidden_size = hf_config.hidden_size
    # HF MoE configs (Mixtral/Qwen-MoE/DeepSeek) name this attribute
    # "num_experts_per_tok"; the original "num_experts_per_token" spelling is
    # kept as a fallback for safety, with 1 as the final default for
    # non-MoE models.
    topk = getattr(
        hf_config, "num_experts_per_tok",
        getattr(hf_config, "num_experts_per_token", 1))
    batch_size = vllm_config.scheduler_config.max_num_batched_tokens

    int8_size = torch.iinfo(torch.int8).bits // 8
    bf16_size = torch.finfo(torch.bfloat16).bits // 8
    # Convert bytes -> MB, rounding up so the buffer is never undersized.
    ep_buffer_size = math.ceil((batch_size * hidden_size * topk *
                                (int8_size * 2 + bf16_size)) / (1024 * 1024))
    return max(ep_buffer_size, _DEFAULT_BUFFER_SIZE)
783+
784+
763785# Currently, when in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1
764786# and HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and
765787# significantly improve communication performance of MC2 ops dispatch/combine.
0 commit comments