
Commit 9015dfd

k-artem and pragupta authored
[release/2.7] Skip&Fix some testcases for Navi4x (#2645)
Manual cherry-pick of #2401. Fixes #SWDEV-548314.

Co-authored-by: Prachi Gupta <[email protected]>
1 parent a033df6 commit 9015dfd

10 files changed: 39 additions, 5 deletions


functorch/experimental/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # PyTorch forward-mode is not mature yet
-from functorch import functionalize
+from torch._functorch.deprecated import functionalize
 from torch._functorch.apis import chunk_vmap
 from torch._functorch.batch_norm_replacement import replace_all_batch_norm_modules_
 from torch._functorch.eager_transforms import hessian, jacfwd, jvp
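Context for the change above (not part of the diff): the public entry point functorch.experimental.functionalize keeps working; it is now just re-exported from torch._functorch.deprecated, which points users toward torch.func.functionalize. A minimal sketch of the call, assuming the standard functionalize semantics:

import torch
from functorch.experimental import functionalize  # now backed by torch._functorch.deprecated

def f(x):
    y = x.clone()
    y.add_(1)  # in-place op that functionalize rewrites into an out-of-place one
    return y

g = functionalize(f)
print(g(torch.zeros(3)))  # tensor([1., 1., 1.])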

test/distributed/_composable/fsdp/test_fully_shard_training.py

Lines changed: 11 additions & 1 deletion
@@ -27,7 +27,10 @@
 )
 from torch.distributed.tensor import DTensor, init_device_mesh, Shard
 from torch.distributed.tensor.debug import CommDebugMode
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
+    TEST_CUDA,
+)
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     check_sharded_parity,
@@ -41,7 +44,9 @@
 )
 from torch.testing._internal.common_utils import (
     get_cycles_per_ms,
+    NAVI4_ARCH,
     run_tests,
+    skipIfRocmArch,
     wrapSwapTensorsTest,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import (
@@ -94,6 +99,7 @@ def world_size(self) -> int:
         return 4

     @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_param_registration_after_forward(self):
         """Tests the parameter registration after forward."""
         device = torch.device("cuda", 0)
@@ -200,6 +206,7 @@ def world_size(self) -> int:

     @unittest.skipIf(not TEST_CUDA, "no cuda")
     @wrapSwapTensorsTest(True)
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_to_float64_after_init(self):
         """Tests that the user can cast the module to float64 after init."""
         # NOTE: Test fp64 instead of a lower precision dtype like bf16 for
@@ -310,6 +317,9 @@ def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:

     @skip_if_lt_x_gpu(2)
     @compiled_fsdp_test(compile_compute_on_module=Transformer)
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Platform does not support fused SDPA"
+    )
     def test_train_parity_multi_group(self):
         """
         Tests train parity against DDP when using multiple parameter groups for
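For readers unfamiliar with the decorators added above: skipIfRocmArch(NAVI4_ARCH) skips a test only when running on ROCm Navi4x hardware, while the unittest.skipIf guard keys off a capability flag instead of an architecture name. A minimal standalone sketch using the same helpers this diff imports (the test bodies are illustrative only):

import unittest

from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_MEM_EFF_ATTENTION
from torch.testing._internal.common_utils import (
    NAVI4_ARCH,
    run_tests,
    skipIfRocmArch,
    TestCase,
)


class ExampleSkips(TestCase):
    @skipIfRocmArch(NAVI4_ARCH)  # skipped only on ROCm Navi4x devices
    def test_arch_gated(self):
        self.assertTrue(True)

    @unittest.skipIf(
        not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Platform does not support fused SDPA"
    )
    def test_capability_gated(self):
        self.assertTrue(True)


if __name__ == "__main__":
    run_tests()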

test/distributed/_tools/test_sac_ilp.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
     get_optimal_checkpointing_policy_per_module,
     sac_milp,
 )
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import TEST_CUDA, PLATFORM_SUPPORTS_FLASH_ATTENTION
 from torch.testing._internal.common_utils import (
     run_tests,
     skipIfTorchDynamo,
@@ -180,7 +180,7 @@ def test_sac_ilp_case1(self):

     @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
-    @skipIfRocmArch(NAVI_ARCH)
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_sac_ilp_case2(self):
         """
         This is a case where the memory budget is not binding, meaning that no
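A note on the swap above (my reading of the change, not stated in the commit message): instead of skipping test_sac_ilp_case2 on a hard-coded architecture, the guard now asks whether the platform supports flash attention at all, so any backend without fused SDPA is skipped automatically. In isolation the pattern looks like this (the test class and body are placeholders):

import unittest

from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION


class ExampleSDPAGate(unittest.TestCase):
    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
    def test_needs_flash_attention(self):
        # Placeholder body for illustration only.
        self.assertTrue(True)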

test/distributed/elastic/test_control_plane.py

Lines changed: 7 additions & 1 deletion
@@ -15,7 +15,12 @@
     TORCH_WORKER_SERVER_SOCKET,
     worker_main,
 )
-from torch.testing._internal.common_utils import requires_cuda, run_tests, TestCase
+from torch.testing._internal.common_utils import (
+    requires_cuda,
+    run_tests,
+    skipIfRocm,
+    TestCase,
+)


 class UnixHTTPConnection(HTTPConnection):
@@ -151,6 +156,7 @@ def test_dump_nccl_trace_pickle_with_json(self) -> None:
         )
         self.assertEqual(resp.status, 200)

+    @skipIfRocm  # skipped upstream too
     def test_tcp(self) -> None:
         import requests

test/distributed/fsdp/test_fsdp_core.py

Lines changed: 4 additions & 0 deletions
@@ -35,8 +35,11 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    NAVI4_ARCH,
     parametrize,
     run_tests,
+    skipIfRocmArch,
     TEST_HPU,
     TEST_WITH_DEV_DBG_ASAN,
 )
@@ -160,6 +163,7 @@ def test_nested_always_wrap_model(

     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
+    @skipIfRocmArch(NAVI4_ARCH)  # Supported in future releases
     def test_transformer(
         self,
         cpu_offload: CPUOffload,

test/distributed/fsdp/test_fsdp_hybrid_shard.py

Lines changed: 5 additions & 0 deletions
@@ -6,6 +6,7 @@
 from enum import auto, Enum
 from functools import partial
 from typing import Optional
+import unittest

 import torch
 import torch.distributed as dist
@@ -31,6 +32,9 @@
     FSDPTest,
     TransformerWithSharedParams,
 )
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     run_tests,
@@ -227,6 +231,7 @@ def test_invalid_pg_specification_raises(self):
     # resharded after forward.

     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention")
     def test_fsdp_hybrid_shard_basic_setup(self):
         """
         Tests basic functionality of HYBRID_SHARD and _HYBRID_SHARD_ZERO2:

test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,7 @@
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
+from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_MEM_EFF_ATTENTION
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     DEVICEInitMode,
@@ -236,6 +237,9 @@ def _build_model_and_optim(
         return model, optim, ref_model, ref_optim

     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Platform does not support fused SDPA"
+    )
     def test_sharded_grad_scaler_found_inf(self):
         self.run_subtests(
             {

test/distributed/optim/test_zero_redundancy_optimizer.py

Lines changed: 2 additions & 0 deletions
@@ -917,6 +917,8 @@ def closure_sharded(input_tensor=input_tensor):
         torch.testing.assert_close(
             loss_ddp,
             loss_sharded_optim,
+            atol=1.6e-3,
+            rtol=3e-6,
             msg="Losses differ between local optimizer and ZeRO",
         )
         self._check_same_model_params(
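For reference, the explicit tolerances added above feed directly into torch.testing.assert_close, which checks that |actual - expected| <= atol + rtol * |expected| elementwise. A small self-contained sketch with illustrative values:

import torch

actual = torch.tensor([1.0000, 2.0000])
expected = torch.tensor([1.0010, 2.0005])

# Passes with the loosened bounds: 1e-3 <= 1.6e-3 + 3e-6 * |expected|.
torch.testing.assert_close(actual, expected, atol=1.6e-3, rtol=3e-6)

# With the float32 defaults (rtol=1.3e-6, atol=1e-5) the same comparison raises.
# torch.testing.assert_close(actual, expected)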

test/distributed/tensor/parallel/test_tp_examples.py

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@
     Transformer,
     with_comms,
 )
+from unittest import skipIf


 c10d_functional = torch.ops.c10d_functional

test/test_linalg.py

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@
     TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, iter_indices,
     make_fullrank_matrices_with_distinct_singular_values,
     freeze_rng_state, IS_ARM64, IS_SANDCASTLE, TEST_OPT_EINSUM, parametrize, skipIfTorchDynamo,
+    skipIfRocmArch, NAVI4_ARCH,
     setBlasBackendsToDefaultFinally, setLinalgBackendsToDefaultFinally, serialTest,
     runOnRocmArch, MI300_ARCH)
 from torch.testing._internal.common_device_type import \
@@ -7149,6 +7150,7 @@ def test_baddbmm_input_dtypes_compatibility(self, device, dtype):

     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error")
     @onlyCUDA
+    @skipIfRocmArch(NAVI4_ARCH)
     def test_matmul_45724(self, device):
         # https://github.com/pytorch/pytorch/issues/45724
         a = torch.rand(65537, 22, 64, device=device, dtype=torch.half)
