From 810297d367e241183cf46f36a12ca2ee5a28a3ce Mon Sep 17 00:00:00 2001
From: mojave2
Date: Fri, 28 Nov 2025 16:08:12 +0800
Subject: [PATCH] [bugfix] fix the hccl_buffsize configuration for EP

Signed-off-by: mojave2
---
 vllm_ascend/utils.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 1564b506e62..1316a6f5d95 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -739,6 +739,9 @@ def get_hccl_config_for_pg_options(group_name: str) -> Optional[dict]:
         "dp": {
             "hccl_buffer_size": calculate_dp_buffer_size()
         },
+        "ep": {
+            "hccl_buffer_size": calculate_ep_buffer_size()
+        },
     }
     return hccl_config_map.get(group_name, get_default_buffer_config())
 
@@ -760,6 +763,26 @@ def calculate_dp_buffer_size() -> int:
     return max(dp_buffer_size, _MIN_DP_BUFFER_SIZE)
 
 
+def calculate_ep_buffer_size() -> int:
+    """
+    Formula for the EP buffer size, in MB:
+    batch_size * hidden_size * topk * 4 bytes (2 x int8 + 1 x bf16)
+    """
+    from vllm.config import get_current_vllm_config
+    vllm_config = get_current_vllm_config()
+    hf_config = vllm_config.model_config.hf_config
+
+    hidden_size = hf_config.hidden_size
+    # HF MoE configs expose the router top-k as "num_experts_per_tok".
+    topk = getattr(hf_config, "num_experts_per_tok", 1)
+    batch_size = vllm_config.scheduler_config.max_num_batched_tokens
+    int8_size = torch.iinfo(torch.int8).bits // 8
+    bf16_size = torch.finfo(torch.bfloat16).bits // 8
+    ep_buffer_size = math.ceil((batch_size * hidden_size * topk *
+                                (int8_size * 2 + bf16_size)) / (1024 * 1024))
+    return max(ep_buffer_size, _DEFAULT_BUFFER_SIZE)
+
+
 # Currently, when in A2, setting the environment variables HCCL_INTRA_PCIE_ENABLE=1
 # and HCCL_INTRA_ROCE_ENABLE=0 can reduce cross-machine communication traffic and
 # significantly improve communication performance of MC2 ops dispatch/combine.
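
For a quick sanity check of the formula, here is a minimal standalone sketch
(not part of the patch). The model numbers are hypothetical, DeepSeek-V3-like
values (hidden_size=7168, top-k=8, 8192 batched tokens), and
_DEFAULT_BUFFER_SIZE=200 is an assumption mirroring the usual 200 MB HCCL
default rather than the exact constant in vllm_ascend/utils.py:

import math

import torch

# Assumed floor in MB; the real constant lives in vllm_ascend/utils.py.
_DEFAULT_BUFFER_SIZE = 200


def ep_buffer_size_mb(batch_size: int, hidden_size: int, topk: int) -> int:
    # Each dispatched token-expert pair moves two int8 copies plus one bf16
    # copy of the hidden state: 1 * 2 + 2 = 4 bytes per hidden element.
    int8_size = torch.iinfo(torch.int8).bits // 8      # 1 byte
    bf16_size = torch.finfo(torch.bfloat16).bits // 8  # 2 bytes
    total_bytes = batch_size * hidden_size * topk * (int8_size * 2 + bf16_size)
    return max(math.ceil(total_bytes / (1024 * 1024)), _DEFAULT_BUFFER_SIZE)


# 8192 tokens * 7168 hidden * 8 experts * 4 bytes = 1792 MB
print(ep_buffer_size_mb(batch_size=8192, hidden_size=7168, topk=8))  # -> 1792

At these sizes the formula yields 1792 MB, far above the 200 MB default, which
is why the EP group needs its own hccl_buffer_size entry instead of falling
back to get_default_buffer_config().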