Skip to content

Commit 215391f

Browse files
committed
[bugfix] Fix the problem of HCCL_BUFFSIZE configuration.
Signed-off-by: mojave2 <[email protected]>
1 parent cd58a64 commit 215391f

File tree

2 files changed

+39
-11
lines changed

2 files changed

+39
-11
lines changed

tests/ut/test_utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
import math
1717
import os
1818
from threading import Lock
19+
from types import SimpleNamespace
1920
from unittest import mock
21+
from unittest.mock import patch
2022

2123
import torch
2224
from vllm.config import (CompilationConfig, ModelConfig, ParallelConfig,
@@ -171,6 +173,30 @@ def test_current_stream(self):
171173
with mock.patch("torch.npu.current_stream") as mock_current_stream:
172174
self.assertEqual(utils.current_stream(), mock_current_stream())
173175

176+
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
@patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options")
def test_create_hccl_pg_options_ep_with_env(self, mock_options):
    """The "ep" group picks up its buffer size from HCCL_BUFFSIZE when set."""
    # Stand-in for the Options object; only ``hccl_config`` is inspected.
    fake_options = SimpleNamespace(hccl_config=None)
    mock_options.return_value = fake_options

    pg_options = utils.create_hccl_pg_options("ep")

    # Options must be built exactly once, with no arguments.
    mock_options.assert_called_once_with()
    hccl_config = pg_options.hccl_config
    self.assertIsNotNone(hccl_config)
    self.assertEqual(hccl_config["hccl_buffer_size"], 1024)
186+
187+
@patch.dict(os.environ, {}, clear=False)
@patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options")
def test_create_hccl_pg_options_ep_default(self, mock_options):
    """Without HCCL_BUFFSIZE, the "ep" group uses the default buffer size."""
    # Make sure the variable is absent for this test; the patch.dict
    # decorator restores os.environ once the test finishes.
    os.environ.pop("HCCL_BUFFSIZE", None)
    # Stand-in for the Options object; only ``hccl_config`` is inspected.
    fake_options = SimpleNamespace(hccl_config=None)
    mock_options.return_value = fake_options

    pg_options = utils.create_hccl_pg_options("ep")

    # Options must be built exactly once, with no arguments.
    mock_options.assert_called_once_with()
    hccl_config = pg_options.hccl_config
    self.assertIsNotNone(hccl_config)
    self.assertEqual(hccl_config["hccl_buffer_size"],
                     utils._DEFAULT_BUFFER_SIZE)
199+
174200
def test_vllm_version_is(self):
175201
with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}):
176202
with mock.patch("vllm.__version__", "1.0.0"):

vllm_ascend/utils.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -715,12 +715,11 @@ def npu_stream_switch(target_stream: torch.npu.Stream,
715715
def create_hccl_pg_options(group_name: str):
716716
options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
717717
hccl_config = get_hccl_config_for_pg_options(group_name)
718-
if hccl_config is not None:
719-
options.hccl_config = hccl_config
718+
options.hccl_config = hccl_config
720719
return options
721720

722721

723-
def get_hccl_config_for_pg_options(group_name: str) -> Optional[dict]:
722+
def get_hccl_config_for_pg_options(group_name: str) -> dict:
724723
"""
725724
Get HCCL process group options for the given communication group name.
726725
@@ -730,21 +729,24 @@ def get_hccl_config_for_pg_options(group_name: str) -> Optional[dict]:
730729
Returns:
731730
HCCL pg_options (a dict with `hccl_buffer_size`) for the given group
732731
"""
733-
# FIXME: Current mc2 operators only perform communication space partitioning
734-
# based on HCCL_BUFFSIZE configuration. Using pg_options with mc2 group would
735-
# result in memory misalignment problems.
736-
if group_name and "mc2" in group_name:
737-
return None
738732
hccl_config_map = {
739733
"dp": {
740734
"hccl_buffer_size": calculate_dp_buffer_size()
741735
},
742736
}
743-
return hccl_config_map.get(group_name, get_default_buffer_config())
737+
return hccl_config_map.get(group_name, get_buffer_config())
744738

745739

746-
def get_default_buffer_config() -> dict:
747-
return {"hccl_buffer_size": _DEFAULT_BUFFER_SIZE}
740+
def get_buffer_config() -> dict:
    """
    Build the HCCL buffer configuration, honoring ``HCCL_BUFFSIZE`` if set.

    Returns:
        A dict of the form ``{"hccl_buffer_size": <int>}``. The size is the
        value of the ``HCCL_BUFFSIZE`` environment variable when it is set to
        a positive integer; otherwise it is ``_DEFAULT_BUFFER_SIZE``.
    """
    raw_size = os.getenv("HCCL_BUFFSIZE")
    if raw_size is not None:
        try:
            env_size = int(raw_size)
        except ValueError:
            # Not an integer at all: fall through to the default.
            env_size = None
        # A zero or negative buffer size is treated like an unparsable value
        # and ignored instead of being handed on as the buffer size.
        # NOTE(review): HCCL documents HCCL_BUFFSIZE as an integer >= 1;
        # confirm against the targeted CANN version.
        if env_size is not None and env_size > 0:
            return {"hccl_buffer_size": env_size}
    return {"hccl_buffer_size": _DEFAULT_BUFFER_SIZE}
748750

749751

750752
def calculate_dp_buffer_size() -> int:

0 commit comments

Comments
 (0)