@@ -318,7 +318,11 @@ def fp8e8m0_to_float32(scale):
 @pytest.mark.parametrize("nonKDim", ([0, 16, 32] if is_hip_cdna() else [0]))
 def test_mxfp(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, nonKDim, NUM_WARPS, device):
     if is_xpu():
-        pytest.skip("FIXME: Fail RuntimeError on XPU")
+        if (M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, nonKDim,
+                NUM_WARPS) == (1024, 512, 256, 128, 64, 128, 1, 0,
+                               4) or (M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, nonKDim,
+                                      NUM_WARPS) == (1024, 512, 256, 128, 64, 128, 3, 0, 4):
+            pytest.skip("https://github.com/intel/intel-xpu-backend-for-triton/issues/3677")
     if is_cuda() and torch.cuda.get_device_capability()[0] < 10:
         pytest.skip("Requires compute capability >= 10")
     elif is_hip():
@@ -347,9 +351,17 @@ def test_mxfp(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, nonKDim, NUM_WARPS
     kernel_kwargs = {}
     if is_hip():
         kernel_kwargs["matrix_instr_nonkdim"] = nonKDim
-    out = mxfp_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, a_scale.stride(0), a.stride(0), a.stride(1),
-                            b.stride(0), b.stride(1), output.stride(0), output.stride(1), BLOCK_M, BLOCK_N, BLOCK_K,
-                            NUM_STAGES=NUM_STAGES, **kernel_kwargs, num_warps=NUM_WARPS)
+
+    try:
+        out = mxfp_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, a_scale.stride(0), a.stride(0), a.stride(1),
+                                b.stride(0), b.stride(1), output.stride(0), output.stride(1), BLOCK_M, BLOCK_N, BLOCK_K,
+                                NUM_STAGES=NUM_STAGES, **kernel_kwargs, num_warps=NUM_WARPS)
+    except triton.runtime.errors.OutOfResources as err:
+        if is_xpu() and err.name == "shared memory":
+            pytest.skip(f"{err}")
+        else:
+            raise err
+
     a_scale_f32 = fp8e8m0_to_float32(a_scale)
     b_scale_f32 = fp8e8m0_to_float32(b_scale)
     a_scale_f32 = a_scale_f32.repeat_interleave(32, dim=1)
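For context on the helper named in the first hunk header: fp8e8m0_to_float32 expands the e8m0 block scales to float32 before the test builds its reference result. A minimal sketch of such a conversion, assuming the scales arrive as raw uint8 exponent bytes (the helper's actual body is not part of this diff):

    import torch

    def fp8e8m0_to_float32(scale):
        # e8m0 is an exponent-only format: each byte is a biased power-of-two
        # exponent, so the decoded value is 2 ** (byte - 127).
        return torch.exp2(scale.view(torch.uint8).to(torch.float32) - 127)

The repeat_interleave(32, dim=1) calls that follow then broadcast each per-block scale across the 32 elements it covers.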