[deepseek][blackwell] add manual looping group gemm to enable base working inference on Blackwell #1272

Open · wants to merge 2 commits into main

2 changes: 1 addition & 1 deletion torchtitan/experiments/deepseek_v3/generate.py
@@ -224,7 +224,7 @@ def generate(
    tokenizer,
    dist_config,
    messages: list[dict],
-   n_tokens: int = 200,
+   n_tokens: int = 50,
):
    rank = dist.get_rank()
    device = dist_config.device
61 changes: 61 additions & 0 deletions torchtitan/experiments/deepseek_v3/group_gemms.py
@@ -97,9 +97,70 @@ def is_available() -> bool:
"TorchBF16GroupGEMM",
"TorchAOBF16GroupGEMM",
"TritonCGBF16GroupGEMM",
"ManualLoopGroupGEMM",
]


class ManualLoopGroupGEMM(GroupGEMMStrategy):
    """Manual looping baseline implementation that works on any arch (esp. Blackwell)"""

    def arrange_expert_weights(self, all_weights, submod_name, module):
        """Store weights in a stacked format"""
        return torch.stack(all_weights)

    def execute(self, contig_tokens, m_sizes, m_offsets, module):
        """Execute using manual loops over experts"""
        # Get weights
        w_gate = module.get_parameter("gate_proj_weight")
        w_up = module.get_parameter("up_proj_weight")
        w_down = module.get_parameter("down_proj_weight")

        # Prepare output tensor
        # stacked weights have shape [num_experts, out_dim, in_dim]
        hidden_size = w_gate.shape[2]
        output = torch.zeros(
            contig_tokens.shape[0],
            hidden_size,
            dtype=contig_tokens.dtype,
            device=contig_tokens.device,
        )

        # Process each expert sequentially
        offset = 0
        for expert_idx, size in enumerate(m_sizes):
            if size > 0:
                # Get tokens for this expert
                expert_tokens = contig_tokens[offset : offset + size]

                # Get weights for this expert
                gate_weight = w_gate[expert_idx]  # [out_dim, in_dim]
                up_weight = w_up[expert_idx]
                down_weight = w_down[expert_idx]

                # Forward pass: gate and up projections
                gate_out = torch.mm(expert_tokens, gate_weight.t())
                up_out = torch.mm(expert_tokens, up_weight.t())

                # Apply activation and combine
                hidden = self.activation_function(gate_out) * up_out

                # Down projection
                expert_output = torch.mm(hidden, down_weight.t())

                # Store results
                output[offset : offset + size] = expert_output

            offset += size

        return output

    @staticmethod
    def is_available() -> bool:
        return True


class TritonCGBF16GroupGEMM(GroupGEMMStrategy):
    """Implementation of Triton Contiguous group Gemm"""

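To make the new strategy's behaviour concrete, here is a minimal, self-contained sketch of the same per-expert torch.mm loop outside the class; the shapes, token counts, and SiLU activation are assumptions chosen for illustration, not values taken from the model.

```python
import torch
import torch.nn.functional as F

# Illustrative sizes only (assumptions, not the DeepSeek config).
num_experts, hidden_dim, inter_dim = 4, 16, 32
m_sizes = [3, 0, 5, 2]  # tokens routed to each expert
contig_tokens = torch.randn(sum(m_sizes), hidden_dim)

# Stacked expert weights, mirroring arrange_expert_weights (torch.stack).
w_gate = torch.randn(num_experts, inter_dim, hidden_dim)
w_up = torch.randn(num_experts, inter_dim, hidden_dim)
w_down = torch.randn(num_experts, hidden_dim, inter_dim)

output = torch.zeros_like(contig_tokens)
offset = 0
for expert_idx, size in enumerate(m_sizes):
    if size > 0:
        x = contig_tokens[offset : offset + size]
        gate_out = torch.mm(x, w_gate[expert_idx].t())
        up_out = torch.mm(x, w_up[expert_idx].t())
        hidden = F.silu(gate_out) * up_out  # SwiGLU-style gating (assumed activation)
        output[offset : offset + size] = torch.mm(hidden, w_down[expert_idx].t())
    offset += size

print(output.shape)  # torch.Size([10, 16])
```

Because this path relies only on torch.mm, it has no special-kernel requirements, which is what makes it a workable baseline on Blackwell.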
6 changes: 5 additions & 1 deletion torchtitan/experiments/deepseek_v3/model.py
@@ -46,6 +46,7 @@

from group_gemms import (
    DSGroupGEMM,
    ManualLoopGroupGEMM,
    TorchAOBF16GroupGEMM,
    TorchBF16GroupGEMM,
    TorchFP8GroupGEMM,
@@ -474,7 +475,7 @@ class MoE(nn.Module):
    # Group GEMM strategies
    group_gemm_strategies = None
    # which group gemm to use?
-   group_mm = "torch"  # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["torch", , "torchao", "tritoncg"]
+   group_mm = "manual"  # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["torch", "torchao", "tritoncg", "manual"]

    def __init__(self, config):
        super().__init__()
@@ -527,7 +528,10 @@ def __init__(self, config):
    def _initialize_group_gemm_strategies(cls):
        """Initialize available group GEMM strategies"""
        cls.group_gemm_strategies = {
            # torch._group_MM
            "torch": TorchBF16GroupGEMM(MLP.act_fn),
            # torch.mm with looping
            "manual": ManualLoopGroupGEMM(MLP.act_fn),
            "torchao": (
                TorchAOBF16GroupGEMM(MLP.act_fn)
                if TorchAOBF16GroupGEMM.is_available()
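For context, here is a self-contained sketch of the availability-guard pattern this registry uses; the classes below are stand-in mocks, not the real torchtitan strategies. Entries for optional kernels are registered as None when unavailable, while "manual" is registered unconditionally and its is_available() always returns True, so it stays usable on architectures such as Blackwell where the tuned kernels are missing.

```python
from typing import Dict, Optional


class MockManualLoop:
    """Stand-in for ManualLoopGroupGEMM: plain torch.mm loops, no special kernels."""

    @staticmethod
    def is_available() -> bool:
        return True  # always usable, on any architecture


class MockTritonCG:
    """Stand-in for a kernel-backed strategy whose kernel is missing here."""

    @staticmethod
    def is_available() -> bool:
        return False


# Mirrors the "X(...) if X.is_available() else None" pattern from model.py.
strategies: Dict[str, Optional[object]] = {
    "manual": MockManualLoop(),  # registered unconditionally
    "tritoncg": MockTritonCG() if MockTritonCG.is_available() else None,
}

print(strategies["manual"] is not None)    # True  -> safe default everywhere
print(strategies["tritoncg"] is not None)  # False -> needs a fallback like "manual"
```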