
Commit 24a6c29

[SWDEV-523736] Skip&Fix some testcases for Navi4x
1 parent 9663f2d commit 24a6c29

File tree: 12 files changed, +169 -93 lines


aten/src/ATen/native/cuda/Blas.cpp
Lines changed: 2 additions & 0 deletions

@@ -1048,9 +1048,11 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
   TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
   TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
+#ifndef USE_ROCM
   // Type restrictions imposed by CuBLASLt as of CUDA-12.1
   TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2,
         "Multiplication of two Float8_e5m2 matrices is not supported");
+#endif
   if (bias) {
     TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32");
     TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
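
For context, a minimal Python-side sketch (illustrative, not part of the commit) of what the guarded check means for callers of torch._scaled_mm. The shapes, scale handling, and keyword names below assume a recent torch._scaled_mm signature; on CUDA builds cuBLASLt still rejects e5m2 x e5m2 via the check above, while ROCm builds now compile the check out and leave the decision to the backend.

    import torch

    a = torch.randn(32, 64, device="cuda").to(torch.float8_e5m2)
    # mat2 is expected column-major; a transposed view of a row-major tensor works.
    b = torch.randn(32, 64, device="cuda").to(torch.float8_e5m2).t()
    scale = torch.tensor(1.0, device="cuda")  # per-tensor scales

    try:
        out = torch._scaled_mm(a, b, scale_a=scale, scale_b=scale, out_dtype=torch.bfloat16)
    except RuntimeError as err:
        # On non-ROCm builds this is the message from the TORCH_CHECK above.
        print(err)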

test/distributed/_tools/test_sac_ilp.py
Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@
     get_optimal_checkpointing_policy_per_module,
     sac_milp,
 )
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import TEST_CUDA, PLATFORM_SUPPORTS_FLASH_ATTENTION
 from torch.testing._internal.common_utils import (
     run_tests,
     skipIfTorchDynamo,
@@ -181,7 +181,7 @@ def test_sac_ilp_case1(self):

     @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
-    @skipIfRocmArch(NAVI_ARCH)
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_sac_ilp_case2(self):
         """
         This is a case where the memory budget is not binding, meaning that no
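
The same gating pattern recurs in the tensor-parallel and flex-decoding diffs below: instead of hard-coding a NAVI arch skip, the test is skipped wherever flash-attention-backed SDPA is unavailable. A minimal standalone sketch of the pattern (illustrative, not part of the commit):

    import unittest

    import torch
    from torch.testing._internal.common_cuda import TEST_CUDA, PLATFORM_SUPPORTS_FLASH_ATTENTION


    class SdpaGatingExample(unittest.TestCase):
        @unittest.skipIf(not TEST_CUDA, "CUDA not available")
        @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
        def test_sdpa_smoke(self):
            # Runs only where flash-attention kernels are expected to exist
            # (skipped on e.g. ROCm Navi parts without flash support).
            q = k = v = torch.randn(1, 2, 8, 16, device="cuda", dtype=torch.float16)
            out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
            self.assertEqual(out.shape, q.shape)


    if __name__ == "__main__":
        unittest.main()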

test/distributed/tensor/parallel/test_tp_examples.py
Lines changed: 4 additions & 0 deletions

@@ -27,6 +27,7 @@
     RowwiseParallel,
 )
 from torch.distributed.tensor.parallel.input_reshard import input_reshard
+from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
@@ -41,6 +42,7 @@
     Transformer,
     with_comms,
 )
+from unittest import skipIf


 c10d_functional = torch.ops.c10d_functional
@@ -412,6 +414,8 @@ def test_transformer_training(self, is_seq_parallel, dtype: torch.dtype):
             + f"{str(dtype).split('.')[-1]}_"
             + f"thaw_{'__'.join(sorted({n.rpartition('.')[0].replace('.', '_') for n in thaw})) if thaw else 'all'}",
     )
+
+    @skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_transformer_req_grad(self, thaw_params, is_seq_parallel, dtype, exp_cnts):
         # Sample a subset of `requires_grad` patterns

test/dynamo/test_graph_deduplication.py
Lines changed: 48 additions & 67 deletions
(Large diffs are not rendered by default.)

test/dynamo/test_graph_region_tracker.py
Lines changed: 3 additions & 6 deletions

@@ -70,8 +70,7 @@ def fn(x, y):
                 torch.rand(10, 10),
                 torch.ones(10, 20),
             ),
-            """[[['y0', 'x0', 'sum_2', 'sum_1', 'z'], \
-['y0_1', 'x0_1', 'sum_4', 'sum_3', 'z_1'], ['y0_2', 'x0_2', 'sum_6', 'sum_5', 'z_2']]]""",
+            """[[['x0', 'y0', 'sum_1', 'sum_2', 'z'], ['x0_1', 'y0_1', 'sum_3', 'sum_4', 'z_1'], ['x0_2', 'y0_2', 'sum_5', 'sum_6', 'z_2']]]""",
         )

     def test_get_regions_multiple_region_groups(self):
@@ -104,8 +103,7 @@ def fn(x, y):
                 torch.rand(10, 10),
                 torch.ones(10, 20),
             ),
-            """[[['y1', 'x1', 'sum_3', 'sum_2', 'z'], ['y1_1', 'x1_1', 'sum_5', 'sum_4', 'z_1'], \
-['y1_2', 'x1_2', 'sum_8', 'sum_7', 'z_2']], [['b', 'cos_1', 'sum_1', 'a', 'c'], ['b_1', 'cos_2', 'sum_6', 'a_1', 'c_1']]]""",
+            """[[['x1', 'y1', 'sum_2', 'sum_3', 'z'], ['x1_1', 'y1_1', 'sum_4', 'sum_5', 'z_1'], ['x1_2', 'y1_2', 'sum_7', 'sum_8', 'z_2']], [['a', 'b', 'cos_1', 'sum_1', 'c'], ['a_1', 'b_1', 'cos_2', 'sum_6', 'c_1']]]""",
         )

     def test_no_single_node_regions(self):
@@ -177,8 +175,7 @@ def fn(x, y):
                 torch.rand(10, 10),
                 torch.ones(10, 20),
             ),
-            """[[['y1', 'sum_1', 'x1', 'o0'], ['y1_1', 'sum_2', 'x1_1', 'o2'], \
-['y1_2', 'sum_3', 'x1_2', 'o4'], ['y1_3', 'sum_4', 'x1_3', 'o5']]]""",
+            """[[['x1', 'y1', 'sum_1', 'o0'], ['x1_1', 'y1_1', 'sum_2', 'o2'], ['x1_2', 'y1_2', 'sum_3', 'o4'], ['x1_3', 'y1_3', 'sum_4', 'o5']]]""",
         )

     def test_nested_args(self):

test/inductor/test_cooperative_reductions.py
Lines changed: 89 additions & 8 deletions

@@ -12,6 +12,7 @@
 from torch._inductor.codegen.triton import FixedTritonConfig, TritonKernel
 from torch._inductor.test_case import TestCase
 from torch._inductor.utils import run_and_get_code
+from torch.testing import assert_close
 from torch.testing._internal.common_cuda import IS_SM89
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -33,19 +34,99 @@ def setUp(self):
         torch._inductor.metrics.generated_kernel_count = 0
         torch._dynamo.reset()

-    def run_and_check(self, fn, args, *, expect_kernel_count=1):
-        args_cpu = [tensor.cpu().to(torch.float32) for tensor in args]
-        expected = fn(*args_cpu).to(torch.float16)
-        fn = torch.compile(fn, fullgraph=True)
-        result, (source_code,) = run_and_get_code(fn, *args)
-        self.assertEqual(result, expected)
-        self.assertIn("@triton_heuristics.cooperative_reduction", source_code)
+    def run_and_check(self, fn, args, dtype=None, *, expect_kernel_count=1):
+        # Define fixed tolerances
+        RTOL = 1e-5
+        ATOL = 1e-6
+
+        # calculate reference value in higher precision when input dtype is float16
+        ref_dtype = dtype
+        if dtype == torch.float16:
+            ref_dtype = torch.float64
+
+        # Cast to the determined reference dtype
+        args_ref = [tensor.to(ref_dtype) for tensor in args]
+
+        # Calculate expected output
+        raw_expected = fn(*args_ref)
+
+        if isinstance(raw_expected, (tuple, list)):
+            # If it's a tuple or list, apply .to(dtype) to each tensor within it
+            # Also, handle cases where dtype might not be provided (e.g., for bool reductions)
+            if dtype is not None:
+                expected = type(raw_expected)(
+                    [
+                        t.to(dtype) if isinstance(t, torch.Tensor) else t
+                        for t in raw_expected
+                    ]
+                )
+            else:
+                expected = type(raw_expected)(
+                    [
+                        t.to(torch.float64) if isinstance(t, torch.Tensor) else t
+                        for t in raw_expected
+                    ]
+                )
+        else:
+            # If it's a single tensor
+            if dtype is not None:
+                expected = raw_expected.to(dtype)
+            else:
+                expected = raw_expected.to(torch.float64)
+
+        fn_compiled = torch.compile(fn, fullgraph=True)
+        result, (source_code,) = run_and_get_code(fn_compiled, *args)
+
+        # For comparison, ensure result is also a tuple/list if expected is
+        if isinstance(expected, (tuple, list)):
+            if isinstance(result, torch.Tensor):
+                result = (result,)
+            elif not isinstance(result, type(expected)):
+                result = type(expected)(result)
+
+            if dtype is not None:
+                result = type(result)(
+                    [t.to(dtype) if isinstance(t, torch.Tensor) else t for t in result]
+                )
+            else:
+                result = type(result)(
+                    [
+                        t.to(torch.float64) if isinstance(t, torch.Tensor) else t
+                        for t in result
+                    ]
+                )
+        else:
+            if dtype is not None and isinstance(result, torch.Tensor):
+                result = result.to(dtype)
+            elif isinstance(result, torch.Tensor):
+                result = result.to(torch.float64)
+
+        # Apply assert_close with fixed tolerances for tensor comparisons
+        if isinstance(result, torch.Tensor) and isinstance(expected, torch.Tensor):
+            assert_close(result, expected, rtol=RTOL, atol=ATOL)
+        elif isinstance(result, (tuple, list)) and isinstance(expected, (tuple, list)):
+            # Iterate through elements for comparison
+            for r_item, e_item in zip(result, expected):
+                if isinstance(r_item, torch.Tensor) and isinstance(
+                    e_item, torch.Tensor
+                ):
+                    assert_close(r_item, e_item, rtol=RTOL, atol=ATOL)
+                else:
+                    # Fallback to assertEqual for non-tensor elements (e.g., bool, int)
+                    self.assertEqual(r_item, e_item)
+        else:
+            # Fallback to assertEqual for other types not handled by assert_close
+            self.assertEqual(result, expected)
+
+        if "@triton_heuristics.fixed_config" in source_code:
+            self.assertIn("cooperative_reduction_grid", source_code)
+        else:
+            self.assertIn("@triton_heuristics.cooperative_reduction", source_code)
         if "async_compile.multi_kernel" not in source_code:
             self.assertEqual(
                 torch._inductor.metrics.generated_kernel_count, expect_kernel_count
             )
         return source_code
-
     @parametrize(
         "name",
         [
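
A hedged usage sketch of the reworked helper (illustrative only; the test name, `fn`, and shapes are placeholders, not from the diff). A test method inside this file's TestCase would now pass the input dtype so that the reference is computed in float64 and the comparison uses the fixed rtol/atol defined above:

    def test_sum_mean_fp16(self):
        def fn(x):
            # two reductions fused into one cooperative-reduction kernel
            return x.sum(dim=-1), x.mean(dim=-1)

        args = [torch.randn(4, 100000, device="cuda", dtype=torch.float16)]
        self.run_and_check(fn, args, dtype=torch.float16)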

test/inductor/test_cuda_repro.py
Lines changed: 2 additions & 0 deletions

@@ -34,6 +34,7 @@
     IS_FBCODE,
     skipIfRocm,
     TEST_WITH_ASAN,
+    xfailIfPy312Plus,
 )


@@ -1568,6 +1569,7 @@ def get_input() -> torch.Tensor:
         self.assertEqual(result, a + b)
         self.assertIn("znumel", code)

+    @xfailIfPy312Plus  # https://github.com/pytorch/pytorch/issues/142032
     def test_repeated_masked_load(self):
         target_size = (8, 2)
         mem_eff_temporal_upsampling_interp_chunks = 2

test/inductor/test_flex_decoding.py
Lines changed: 1 addition & 0 deletions

@@ -1332,6 +1332,7 @@ def mask_mod(b, h, q, kv):
         self.assertEqual(query.grad[:, :, M:, :].sum(), 0)

     @supported_platform
+    @skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_no_mask_vs_sdpa(self):
         score_mod = _generate_windowed(1000)
         attention = functools.partial(flex_attention, score_mod=score_mod)

test/test_license.py
Lines changed: 5 additions & 1 deletion

@@ -45,7 +45,11 @@ def test_distinfo_license(self):
             'Found too many "torch-*dist-info" directories '
             f'in "{site_packages}, expected only one'
         )
-        with open(os.path.join(os.path.join(distinfo[0], "LICENSE"))) as fid:
+        # setuptools renamed *dist-info/LICENSE to *dist-info/licenses/LICENSE since 77.0
+        license_file = os.path.join(distinfo[0], "licenses", "LICENSE")
+        if not os.path.exists(license_file):
+            license_file = os.path.join(distinfo[0], "LICENSE")
+        with open(license_file) as fid:
             txt = fid.read()
         self.assertTrue(starting_txt in txt)

test/test_linalg.py
Lines changed: 2 additions & 0 deletions

@@ -19,6 +19,7 @@
     TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, iter_indices,
     make_fullrank_matrices_with_distinct_singular_values,
     freeze_rng_state, IS_ARM64, IS_SANDCASTLE, TEST_OPT_EINSUM, parametrize, skipIfTorchDynamo,
+    skipIfRocmArch, NAVI4_ARCH,
     setBlasBackendsToDefaultFinally, setLinalgBackendsToDefaultFinally, serialTest)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes, has_cusolver, has_hipsolver,
@@ -6440,6 +6441,7 @@ def test_baddbmm_input_dtypes_compatibility(self, device, dtype):

     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error")
     @onlyCUDA
+    @skipIfRocmArch(NAVI4_ARCH)
     def test_matmul_45724(self, device):
         # https://github.com/pytorch/pytorch/issues/45724
         a = torch.rand(65537, 22, 64, device=device, dtype=torch.half)
