revert masking_utils modification

wcrzlh · wcrzlh · commit 24debc8bef93 · 2025-11-04T15:20:25.000+08:00
diff --git a/mindone/transformers/masking_utils.py b/mindone/transformers/masking_utils.py
@@ -304,20 +304,22 @@ def sdpa_mask_recent_torch(
 
     # Similar to `kv_arange = mint.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
     # but without data-dependent slicing (i.e. torch.compile friendly)
-    kv_arange = mint.arange(kv_length, device=cache_position.device)
+    kv_arange = mint.arange(kv_length)
     kv_arange += kv_offset
 
     # Potentially add the padding 2D mask
     if padding_mask is not None:
         mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
 
-    batch_arange = mint.arange(batch_size, device=cache_position.device)
-    head_arange = mint.arange(1, device=cache_position.device)
+    batch_arange = mint.arange(batch_size)
+    head_arange = mint.arange(1)
     # This creates the 4D mask easily. Note that we need this context manager as vmap cannot handle slicing a tensor from
     # scalar tensor (it internally calls `.item()` which vmap does not allow, but this context works around it
     # We don't need to add an offset to the mask_function either, as we vmap directly the correct indices for k and kv indices
     # with TransformGetItemToIndex():
-    causal_mask = _vmap_for_bhqkv(mask_function)(batch_arange, head_arange, cache_position, kv_arange)
+    # TODO: Using 'mindspore.vmap' here causes a compile problem, so we drop that operator and build the mask step by step (2D mask --> 4D mask)
+    causal_mask = mask_function()(batch_arange, head_arange, cache_position, kv_arange)
+    causal_mask = causal_mask[None, None, :, :].broadcast_to((batch_size, -1, -1, -1))
 
     return causal_mask
 
@@ -383,7 +385,8 @@ def sdpa_mask_older_torch(
     # as vmap cannot handle slicing a tensor from scalar tensor (it internally calls `.item()` which vmap does not allow
     # However, in more recent version of Pytorch, a trick was introduced to handle it - which is the reason we have
     # `sdpa_mask_recent_torch`, as it allows more general `mask_function`
-    causal_mask = _vmap_for_bhqkv(mask_function, bh_indices=False)(None, None, cache_position, kv_arange)
+    # TODO: Using 'mindspore.vmap' here causes a compile problem, so we drop that operator and build the mask step by step (2D mask --> 4D mask)
+    causal_mask = mask_function()(None, None, cache_position, kv_arange)
     causal_mask = causal_mask[None, None, :, :].broadcast_to((batch_size, -1, -1, -1))
     if padding_mask is not None:
         causal_mask = causal_mask * padding_mask[:, None, None, :]
@@ -436,7 +439,8 @@ def _ignore_causal_mask_sdpa(
 
 # We use the version with newer torch whenever possible, as it is more general and can handle arbitrary mask functions
 # (especially mask_function indexing a tensor, such as the padding mask function)
-sdpa_mask = sdpa_mask_older_torch  # TODO: use sdpa_mask_recent_torch orsdpa_mask_older_torch?
+# TODO: We do not go through the older sdpa function (cf. the transformers setting); the default is `sdpa_mask_recent_torch`
+sdpa_mask = sdpa_mask_recent_torch
 
 
 def eager_mask(
@@ -669,6 +673,8 @@ def create_causal_mask(
     cache_position: ms.Tensor,
     past_key_values: Optional[Cache],
     position_ids: Optional[ms.Tensor] = None,
+    or_mask_function: Optional[Callable] = None,
+    and_mask_function: Optional[Callable] = None,
 ) -> Optional[Union[ms.Tensor, BlockMask]]:
     """
     Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
@@ -717,18 +723,17 @@ def create_causal_mask(
     # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
     allow_is_causal_skip = not past_key_values.is_compileable if past_key_values is not None else True
 
-    # TODO there is a compile problem during and_masks/or_masks func used as mask_factory_function, Comment this part firstly
-    # # If we detected packing format
-    # if packed_sequence_mask is not None:
-    #     mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
-    #     allow_is_causal_skip = False
-    # # Allow slight deviations from causal mask
-    # if or_mask_function is not None:
-    #     mask_factory_function = or_masks(mask_factory_function, or_mask_function)
-    #     allow_is_causal_skip = False
-    # if and_mask_function is not None:
-    #     mask_factory_function = and_masks(mask_factory_function, and_mask_function)
-    #     allow_is_causal_skip = False
+    # If we detected packing format
+    if packed_sequence_mask is not None:
+        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
+        allow_is_causal_skip = False
+    # Allow slight deviations from causal mask
+    if or_mask_function is not None:
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        allow_is_causal_skip = False
+    if and_mask_function is not None:
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_causal_skip = False
 
     # We now create the mask
     causal_mask = mask_interface(
@@ -806,18 +811,17 @@ def create_sliding_window_causal_mask(
     # Do not allow skip if we are compiling (this is to match BC)
     # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
     allow_is_causal_skip = not past_key_values.is_compileable if past_key_values is not None else True
-    # # TODO there is a compile problem during and_masks/or_masks func used as mask_factory_function, Comment this part firstly
-    # # If we detected packing format
-    # if packed_sequence_mask is not None:
-    #     mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
-    #     allow_is_causal_skip = False
-    # # Allow slight deviations from sliding causal mask
-    # if or_mask_function is not None:
-    #     mask_factory_function = or_masks(mask_factory_function, or_mask_function)
-    #     allow_is_causal_skip = False
-    # if and_mask_function is not None:
-    #     mask_factory_function = and_masks(mask_factory_function, and_mask_function)
-    #     allow_is_causal_skip = False
+    # If we detected packing format
+    if packed_sequence_mask is not None:
+        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
+        allow_is_causal_skip = False
+    # Allow slight deviations from sliding causal mask
+    if or_mask_function is not None:
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        allow_is_causal_skip = False
+    if and_mask_function is not None:
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_causal_skip = False
 
     # We now create the mask
     causal_mask = mask_interface(