44 changes: 42 additions & 2 deletions .github/workflows/pr-test-npu.yml
@@ -66,13 +66,33 @@ jobs:
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py

- name: Run test deepep eplb
- name: Run test base fused deep moe
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --num-experts 256
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 128
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 256

- name: Run test multi-model for fused deep moe
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --num-experts 256 --hidden 6144 --moe-intermediate-size 2048
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 128 --hidden 6144 --moe-intermediate-size 2048
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --num-experts 256 --hidden 4096 --moe-intermediate-size 1536
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 256 --hidden 4096 --moe-intermediate-size 1536
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 128 --hidden 4096 --moe-intermediate-size 1536

- name: Run test fused deepep moe eplb
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --topk-drop-col 3
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --topk-drop-prob 0.3
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --topk-drop-col 2
@@ -127,13 +147,33 @@ jobs:
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py

- name: Run test deepep eplb
- name: Run test base fused deep moe
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --num-experts 256
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 128
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 256

- name: Run test multi-model for fused deep moe
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --num-experts 256 --hidden 6144 --moe-intermediate-size 2048
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 128 --hidden 6144 --moe-intermediate-size 2048
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --num-experts 256 --hidden 4096 --moe-intermediate-size 1536
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 256 --hidden 4096 --moe-intermediate-size 1536
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 48 --num-experts 128 --hidden 4096 --moe-intermediate-size 1536

- name: Run test fused deepep moe eplb
timeout-minutes: 10
env:
HCCL_BUFFSIZE: 2000
run: |
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --topk-drop-col 3
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --topk-drop-prob 0.3
python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --topk-drop-col 2
36 changes: 27 additions & 9 deletions tests/python/deepep/test_fused_deep_moe.py
@@ -13,28 +13,31 @@

torch_npu.npu.config.allow_internal_format = True

GMM_TILE_N_DIM = 64


# ======================== Weight Initialization ========================
def init_base_weights(
num_local_experts, hidden_in=7168, hidden_mid=4096, hidden_out=2048
num_local_experts,
hidden_in=7168,
moe_intermediate_size=4096,
):
"""
Initialize the weights for each local expert.
`num_local_experts`: Number of experts per rank = `num_experts` // `num_ranks`
`hidden_in`: Input dimension (default 7168)
`hidden_mid`: Intermediate layer dimension (default 4096)
`hidden_out`: Output dimension (default 2048)
`moe_intermediate_size`: MoE intermediate layer dimension (default 4096)
"""

hidden_out = moe_intermediate_size // 2
w13_weight = torch.randint(
-16, 16, [num_local_experts, hidden_mid, hidden_in], dtype=torch.int8
-16, 16, [num_local_experts, moe_intermediate_size, hidden_in], dtype=torch.int8
)
w2_weight = torch.randint(
-16, 16, [num_local_experts, hidden_in, hidden_out], dtype=torch.int8
)

w13_weight_scale = (
torch.rand([num_local_experts, hidden_mid, 1]) * 0.0004 + 0.0015
torch.rand([num_local_experts, moe_intermediate_size, 1]) * 0.0004 + 0.0015
).bfloat16()
w2_weight_scale = (
torch.rand([num_local_experts, hidden_in, 1]) * 0.0004 + 0.0015
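
Note: the shapes introduced by this hunk can be sanity-checked in isolation. The snippet below is a minimal standalone sketch, not part of the PR; the values are hypothetical and only mirror the defaults visible in this diff. `hidden_out` is derived as `moe_intermediate_size // 2`, presumably because `w13` packs two projections along its output dimension.

import torch

# Illustrative shapes only; mirrors init_base_weights() as shown above.
num_local_experts = 4              # hypothetical per-rank expert count
hidden_in = 7168                   # default --hidden
moe_intermediate_size = 4096       # default --moe-intermediate-size
hidden_out = moe_intermediate_size // 2

w13_weight = torch.randint(
    -16, 16, [num_local_experts, moe_intermediate_size, hidden_in], dtype=torch.int8
)
w2_weight = torch.randint(
    -16, 16, [num_local_experts, hidden_in, hidden_out], dtype=torch.int8
)
# w13's per-expert output dim is twice w2's per-expert reduction dim.
assert w13_weight.shape[1] == 2 * w2_weight.shape[2]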
@@ -74,7 +77,9 @@ def reshape_fusion_gmm_weight(weight, dim):
if dim < 0:
dim += len(original_shape)

weight = weight.view(*original_shape[:dim], 2, 32, 64, *original_shape[dim + 1 :])
weight = weight.view(
*original_shape[:dim], 2, -1, GMM_TILE_N_DIM, *original_shape[dim + 1 :]
)
weight = weight.transpose(dim, dim + 1).contiguous()
weight = weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :])
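
Note: a standalone sketch of what this reshape does, reproducing the transform shown in the hunk above (the rest of the function is truncated in the diff view, so the return statement below is assumed). The view/transpose/view sequence interleaves the two halves of the chosen dimension in tiles of GMM_TILE_N_DIM = 64.

import torch

GMM_TILE_N_DIM = 64  # tile width, as defined earlier in this file

def reshape_fusion_gmm_weight(weight, dim):
    # Split `dim` into (halves=2, groups, 64), swap the halves axis with the
    # groups axis, then flatten back; the same steps shown in the diff.
    original_shape = weight.shape
    if dim < 0:
        dim += len(original_shape)
    weight = weight.view(
        *original_shape[:dim], 2, -1, GMM_TILE_N_DIM, *original_shape[dim + 1 :]
    )
    weight = weight.transpose(dim, dim + 1).contiguous()
    return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :])

# Net effect: 64-wide tiles from the first and second halves of `dim` alternate.
x = torch.arange(256).view(1, 256, 1)
y = reshape_fusion_gmm_weight(x, dim=1)
assert torch.equal(y[0, :64, 0], x[0, 0:64, 0])        # tile 0 of the first half
assert torch.equal(y[0, 64:128, 0], x[0, 128:192, 0])  # tile 0 of the second half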

@@ -232,6 +237,7 @@ def baseline_test(
def test(
num_tokens: int,
hidden: int,
moe_intermediate_size: int,
num_experts: int,
num_topk: int,
rank: int,
@@ -310,6 +316,7 @@ def test(
w13_weight, w13_weight_scale, w2_weight, w2_weight_scale = init_base_weights(
num_local_experts=num_local_experts,
hidden_in=hidden,
moe_intermediate_size=moe_intermediate_size,
)
w13, w13_scale, w2, w2_scale = init_baseline_weights(
w13_weight.clone().detach(),
@@ -459,7 +466,7 @@ def test(
flush=True,
)

assert avg_diff < 1e-4, f"[Rank {rank}] Mismatch detected! diff={avg_diff}"
assert avg_diff < 4e-4, f"[Rank {rank}] Mismatch detected! diff={avg_diff}"

# ----- Compare Recv Count -----
all_expert_token_counts = [
@@ -576,7 +583,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
group2 = dist.new_group(list(range(num_ranks)))

shared_expert_rank_num = int(os.getenv("MOE_SHARED_EXPERT_RANK_NUM", 0))
num_tokens, hidden = args.num_tokens, args.hidden
num_tokens, hidden, moe_intermediate_size = (
args.num_tokens,
args.hidden,
args.moe_intermediate_size,
)
num_topk, num_experts = args.num_topk, args.num_experts
use_experts = num_experts if shared_expert_rank_num == 0 else (num_experts - 1)
use_ranks = num_ranks - shared_expert_rank_num
@@ -599,6 +610,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
test(
num_tokens,
hidden,
moe_intermediate_size,
use_experts,
num_topk,
rank,
@@ -637,6 +649,12 @@ def str_to_bool(value):
parser.add_argument(
"--hidden", type=int, default=7168, help="Hidden dimension size (default: 7168)"
)
parser.add_argument(
"--moe-intermediate-size",
type=int,
default=4096,
help="Moe intermediate size (default: 4096)",
)
parser.add_argument(
"--num-topk", type=int, default=8, help="Number of top-k experts (default: 8)"
)