Skip to content

Commit 7b41619

Browse files
committed
[bugfix] the hccl_buffsize configuration.
Signed-off-by: mojave2 <[email protected]>
1 parent cd58a64 commit 7b41619

File tree

2 files changed

+53
-11
lines changed

2 files changed

+53
-11
lines changed

tests/ut/test_utils.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
import math
1717
import os
1818
from threading import Lock
19+
from types import SimpleNamespace
1920
from unittest import mock
21+
from unittest.mock import patch
2022

2123
import torch
2224
from vllm.config import (CompilationConfig, ModelConfig, ParallelConfig,
@@ -170,6 +172,41 @@ def test_find_hccl_library(self):
170172
def test_current_stream(self):
171173
with mock.patch("torch.npu.current_stream") as mock_current_stream:
172174
self.assertEqual(utils.current_stream(), mock_current_stream())
175+
176+
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
177+
@patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options")
178+
def test_create_hccl_pg_options_mc2_with_env(self, mock_options):
179+
mock_options.return_value = SimpleNamespace(hccl_config=None)
180+
181+
options = utils.create_hccl_pg_options("mc2")
182+
183+
mock_options.assert_called_once_with()
184+
self.assertIsNotNone(options.hccl_config)
185+
self.assertEqual(options.hccl_config["hccl_buffer_size"], 2048)
186+
187+
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
188+
@patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options")
189+
def test_create_hccl_pg_options_ep_with_env(self, mock_options):
190+
mock_options.return_value = SimpleNamespace(hccl_config=None)
191+
192+
options = utils.create_hccl_pg_options("ep")
193+
194+
mock_options.assert_called_once_with()
195+
self.assertIsNotNone(options.hccl_config)
196+
self.assertEqual(options.hccl_config["hccl_buffer_size"], 1024)
197+
198+
@patch.dict(os.environ, {}, clear=False)
199+
@patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options")
200+
def test_create_hccl_pg_options_ep_default(self, mock_options):
201+
os.environ.pop("HCCL_BUFFSIZE", None)
202+
mock_options.return_value = SimpleNamespace(hccl_config=None)
203+
204+
options = utils.create_hccl_pg_options("ep")
205+
206+
mock_options.assert_called_once_with()
207+
self.assertIsNotNone(options.hccl_config)
208+
self.assertEqual(options.hccl_config["hccl_buffer_size"],
209+
utils._DEFAULT_BUFFER_SIZE)
173210

174211
def test_vllm_version_is(self):
175212
with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}):

vllm_ascend/utils.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -715,12 +715,11 @@ def npu_stream_switch(target_stream: torch.npu.Stream,
715715
def create_hccl_pg_options(group_name: str):
716716
options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
717717
hccl_config = get_hccl_config_for_pg_options(group_name)
718-
if hccl_config is not None:
719-
options.hccl_config = hccl_config
718+
options.hccl_config = hccl_config
720719
return options
721720

722721

723-
def get_hccl_config_for_pg_options(group_name: str) -> Optional[dict]:
722+
def get_hccl_config_for_pg_options(group_name: str) -> dict:
724723
"""
725724
Get HCCL process group options for the given communication group name.
726725
@@ -730,21 +729,27 @@ def get_hccl_config_for_pg_options(group_name: str) -> Optional[dict]:
730729
Returns:
731730
HCCL pg_options (a dict containing hccl_buffer_size) for the given group
732731
"""
733-
# FIXME: Current mc2 operators only perform communication space partitioning
734-
# based on HCCL_BUFFSIZE configuration. Using pg_options with mc2 group would
735-
# result in memory misalignment problems.
736-
if group_name and "mc2" in group_name:
737-
return None
738732
hccl_config_map = {
739733
"dp": {
740734
"hccl_buffer_size": calculate_dp_buffer_size()
741735
},
742736
}
743-
return hccl_config_map.get(group_name, get_default_buffer_config())
737+
return hccl_config_map.get(group_name, get_buffer_config())
744738

745739

746-
def get_default_buffer_config() -> dict:
747-
return {"hccl_buffer_size": _DEFAULT_BUFFER_SIZE}
740+
def get_buffer_config() -> dict:
741+
buffer_size = _DEFAULT_BUFFER_SIZE
742+
env_buffer_size = os.getenv("HCCL_BUFFSIZE")
743+
if env_buffer_size is not None:
744+
try:
745+
val = int(env_buffer_size)
746+
if val > 0:
747+
buffer_size = val
748+
except ValueError:
749+
# Fall back to the default if HCCL_BUFFSIZE is not a valid integer
750+
pass
751+
752+
return {"hccl_buffer_size": buffer_size}
748753

749754

750755
def calculate_dp_buffer_size() -> int:

0 commit comments

Comments
 (0)