[Feat] Enable mix placement for DSR1 #4470
base: main
@@ -33,6 +33,8 @@ def select_experts(hidden_states: torch.Tensor,
                    routed_scaling_factor=1.0,
                    e_score_correction_bias: Optional[torch.Tensor] = None,
                    indices_type: Optional[torch.dtype] = None,
+                   mix_placement: Optional[bool] = False,
+                   num_logical_experts: int = -1,
                    global_num_experts: int = -1):
     """
     Fused experts with select experts.
@@ -87,6 +89,19 @@ def select_experts(hidden_states: torch.Tensor,
         e_score_correction_bias=e_score_correction_bias,
         global_num_experts=global_num_experts,
     )
+    if mix_placement:
+        pad_shared_expert_ids = torch.full((topk_ids.shape[0], 1),
+                                           num_logical_experts,
+                                           dtype=topk_ids.dtype,
+                                           device=topk_ids.device)
+
+        pad_shared_expert_weights = torch.full((topk_weights.shape[0], 1),
+                                               0.4,
+                                               dtype=topk_weights.dtype,
+                                               device=topk_weights.device)
Contributor comment on lines +98 to +101: The value `0.4` used as the shared-expert routing weight is hard-coded here.
+        topk_ids = torch.cat([topk_ids, pad_shared_expert_ids], dim=1)
+        topk_weights = torch.cat([topk_weights, pad_shared_expert_weights],
+                                 dim=1)
     return topk_weights, topk_ids
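For readers unfamiliar with the routing tensors, the block above can be exercised in isolation. The sketch below is a standalone illustration in plain PyTorch: the token count, the top-3 routing, and the shared-expert id of 256 are made-up example values (only the fixed `0.4` weight comes from the diff).

```python
import torch

# Standalone illustration of the mix_placement padding above.
# Example values only: 2 tokens, top-3 routing, shared-expert id 256 (assumed).
num_logical_experts = 256
topk_ids = torch.tensor([[3, 17, 42], [5, 9, 200]], dtype=torch.int32)
topk_weights = torch.tensor([[0.5, 0.3, 0.2], [0.6, 0.25, 0.15]])

# One extra column per token that always points at the shared-expert slot...
pad_shared_expert_ids = torch.full((topk_ids.shape[0], 1),
                                   num_logical_experts,
                                   dtype=topk_ids.dtype,
                                   device=topk_ids.device)
# ...and the fixed routing weight used in the diff.
pad_shared_expert_weights = torch.full((topk_weights.shape[0], 1),
                                       0.4,
                                       dtype=topk_weights.dtype,
                                       device=topk_weights.device)

topk_ids = torch.cat([topk_ids, pad_shared_expert_ids], dim=1)
topk_weights = torch.cat([topk_weights, pad_shared_expert_weights], dim=1)

print(topk_ids)      # each row gains a trailing 256, the shared-expert slot
print(topk_weights)  # each row gains a trailing 0.4
```

In effect, the shared expert is appended as one more routed expert with id `num_logical_experts`, so the downstream expert dispatch can treat it like any other routed expert.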
@@ -170,10 +170,10 @@ def __init__(self, *args, **kwargs):
         self.moe_config.dp_group = get_dp_group()
         self.moe_config.ep_group = get_ep_group()
         self.moe_config.mc2_group = get_mc2_group()
-        ascend_config = get_ascend_config()
-        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
-        self.expert_map_path = ascend_config.expert_map_path
-        self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+        self.ascend_config = get_ascend_config()
+        self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
+        self.expert_map_path = self.ascend_config.expert_map_path
+        self.global_redundant_expert_num = self.ascend_config.init_redundancy_expert
         self.global_num_experts = num_experts + self.global_redundant_expert_num
         if self.custom_routing_function is None and self.e_score_correction_bias is not None:
             vllm_config = get_current_vllm_config()
@@ -190,8 +190,8 @@ def __init__(self, *args, **kwargs):
             self.expert_load_balancer = ExpertLoadBalancer(
                 self.expert_map_path, num_experts)
             self.expert_load_balancer.check_expert_map_tensor()
-            self.global_redundant_expert_num = (
-                self.expert_load_balancer.get_global_redundant_expert_num())
+            # self.global_redundant_expert_num = (
+            #     self.expert_load_balancer.get_global_redundant_expert_num())
Contributor comment on lines +193 to +194: Unused code should be removed rather than commented out.
             self.global_num_experts = num_experts + self.global_redundant_expert_num
             try:
                 self.local_num_experts, self.expert_map = (
@@ -248,7 +248,7 @@ def __init__(self, *args, **kwargs):
             moe_quant_params["intermediate_size_full"] = intermediate_size
         self.quant_method.create_weights(layer=self, **moe_quant_params)

-        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp

         setup_moe_comm_method(self.moe_config)
         self.quant_type = self._get_quant_type()
@@ -275,7 +275,7 @@ def get_map(self):
         return self.expert_map

     def get_log2phy_map(self):
-        return self.logical_to_physical_map
+        return self.log2phy

     def clear_moe_load(self):
         if self.moe_load is not None:
@@ -428,8 +428,8 @@ def __init__(
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
-        ascend_config = get_ascend_config()
-        self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert
+        self.ascend_config = get_ascend_config()
+        self.multistream_overlap_shared_expert = self.ascend_config.multistream_overlap_shared_expert
         if enable_sp():
             logger.info_once(
                 "Sequence parallelism is enabled, shared experts are replicated for best performance."
@@ -457,11 +457,19 @@ def forward(
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        shared_out, fused_out = AscendFusedMoE.forward(
-            self,
-            hidden_states=hidden_states,
-            router_logits=router_logits,
-        )
+        if self._shared_experts is None:
+            fused_out = AscendFusedMoE.forward(
+                self,
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
+            shared_out = None
+        else:
+            shared_out, fused_out = AscendFusedMoE.forward(
+                self,
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
         return shared_out, fused_out

     def forward_impl(self, hidden_states: torch.Tensor,
@@ -475,7 +483,10 @@ def forward_impl(self, hidden_states: torch.Tensor,
                 # Use a separate stream to run shared experts.
                 # Note that currently we only support calculations in separate streams with aclgraph.
                 # Communication operations in another stream might cause unknown errors.
-                shared_out = self._shared_experts(hidden_states)
+                if self._shared_experts is None:
+                    shared_out = None
+                else:
+                    shared_out = self._shared_experts(hidden_states)

         fused_output = AscendFusedMoE.forward_impl(
             self,
@@ -490,6 +501,9 @@ def forward_impl(self, hidden_states: torch.Tensor,
         forward_context = get_forward_context()
         moe_comm_type = forward_context.moe_comm_type
         if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2} \
-                and not shared_expert_dp_enabled():
+                and not shared_expert_dp_enabled() and shared_out is not None:
             shared_out = tensor_model_parallel_all_reduce(shared_out)
-        return shared_out, fused_output
+        if shared_out is None:
+            return fused_output
+        else:
+            return shared_out, fused_output
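Note that `forward_impl` now returns either a bare fused output (when there are no shared experts) or a `(shared_out, fused_output)` tuple. The snippet below is a hypothetical, self-contained stub rather than code from this repository; it only shows how a call site could handle both shapes of the return value.

```python
from typing import Tuple, Union

import torch


def forward_impl_stub(
        hidden_states: torch.Tensor, has_shared_experts: bool
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Mimics the patched return contract: fused output alone, or (shared, fused)."""
    fused_output = hidden_states * 2.0      # stand-in for the routed-expert result
    if not has_shared_experts:
        return fused_output
    shared_out = hidden_states + 1.0        # stand-in for the shared-expert result
    return shared_out, fused_output


x = torch.randn(4, 8)
for has_shared in (False, True):
    result = forward_impl_stub(x, has_shared)
    if isinstance(result, tuple):           # shared experts present
        shared_out, fused_output = result
    else:                                   # no shared experts
        shared_out, fused_output = None, result
    print(has_shared, shared_out is None, tuple(fused_output.shape))
```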
@@ -112,14 +112,16 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
         if quantized_hidden_states is not None:
             dispose_tensor(quantized_hidden_states)
         # act_fn: swiglu
+        group_diff = torch.diff(group_list)
+        new_group = torch.cat([group_diff[0].unsqueeze(0), group_diff], dim=0)
Contributor comment on lines +115 to +116: The calculation of `new_group` is incorrect: it prepends `group_diff[0]`, which is the size of the second group, instead of the first group's size `group_list[0]`. The correct way to get group sizes from a cumulative sum tensor is to take the difference with a zero prepended, as sketched after this hunk.
         hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
             x=hidden_states,
             weight_scale=w1_scale,
             activation_scale=pertoken_scale,
             bias=None,
             quant_scale=None,
             quant_offset=None,
-            group_index=group_list,
+            group_index=new_group,
             activate_left=True,
             quant_mode=1,
         )
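The reviewer's concern can be verified with a small standalone check (the sizes below are illustrative only): when `group_list` is a cumulative sum of per-group token counts, the patch's `torch.diff` plus re-prepended `group_diff[0]` reports the second group's size twice, while differencing with a zero prepended recovers the true sizes.

```python
import torch

sizes = torch.tensor([3, 5, 2, 4])           # true per-group token counts
group_list = torch.cumsum(sizes, dim=0)      # tensor([ 3,  8, 10, 14])

# As written in the patch: the first entry becomes sizes[1] (5), not sizes[0] (3).
group_diff = torch.diff(group_list)
new_group_buggy = torch.cat([group_diff[0].unsqueeze(0), group_diff], dim=0)
print(new_group_buggy)                       # tensor([5, 5, 2, 4])  <- wrong

# Differencing the cumulative sum with a leading zero recovers the sizes.
new_group_fixed = torch.diff(group_list, prepend=group_list.new_zeros(1))
print(new_group_fixed)                       # tensor([3, 5, 2, 4])  <- matches sizes
```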
Review comment: The in-place copy `self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)` will raise a `RuntimeError` if `updated_expert_map` has a different shape than `self.expert_map_per_layer_cpu[layer_id]`. The logic for padding `updated_expert_map` for the device tensor `self.expert_map_per_layer` suggests that shape mismatches are expected. The CPU-side map should be handled in a way that accommodates shape changes to avoid crashes. Reassigning the tensor, as was done previously, is a safer approach.
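To illustrate the failure mode described in the comment, the sketch below uses placeholder tensors rather than the actual EPLB data structures: `Tensor.copy_` requires the source to broadcast to the destination's shape, so a resized expert map raises a `RuntimeError`, whereas rebinding the list entry accepts the new shape.

```python
import torch

# Placeholder for expert_map_per_layer_cpu: one expert-map tensor per layer.
expert_map_per_layer_cpu = [torch.arange(8)]
layer_id = 0

# An updated map with more slots (e.g. after adding redundant experts),
# so its shape no longer matches the stored CPU-side tensor.
updated_expert_map = torch.arange(10)

try:
    expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)
except RuntimeError as err:
    print("in-place copy_ failed:", err)

# Reassigning the entry tolerates the shape change, as the comment suggests.
expert_map_per_layer_cpu[layer_id] = updated_expert_map.clone()
print(expert_map_per_layer_cpu[layer_id].shape)  # torch.Size([10])
```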