intel · alexbaden · Apr 9, 2025 · Mar 17, 2025 · Mar 19, 2025 · Mar 22, 2025
diff --git a/docs/BLOCK_LOADS_LAYOUT.md b/docs/BLOCK_LOADS_LAYOUT.md
diff --git a/python/test/unit/intel/test_block_load.py b/python/test/unit/intel/test_block_load.py
@@ -6,7 +6,7 @@
 from triton._internal_testing import is_xpu
 
 
-@pytest.mark.parametrize("M, N", [[256, 64], [256, 32], [128, 32], [64, 64], [64, 32], [32, 32]])
+@pytest.mark.parametrize("M, N", [[256, 64], [256, 32], [128, 32], [128, 16], [128, 8], [64, 64], [64, 32], [32, 32]])
 @pytest.mark.parametrize("dtype_str", ["float32", "float16", "int8"])
 @pytest.mark.parametrize("transpose", [True, False])
 @pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend")
@@ -15,6 +15,8 @@
 def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pathlib.Path):
     # modify the layouts to ensure the correct OCL/SPIRV intrinsic is called for each datatype
     if dtype_str == "int8":
+        if M == 128 and N == 16 or N == 8:
+            pytest.skip("TODO: test fails verification")
         A_width = 2
         B_width = 4
         layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 4, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 2], A = [8, 32], B = [32, 32], C = [8, 32]}>"
@@ -23,6 +25,8 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa
         B_width = 1
         layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>"
     else:
+        if M == 128 and N == 8:
+            pytest.skip("TODO: test fails verification")
         A_width = 1
         B_width = 2
         layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>"
@@ -73,5 +77,5 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa
     kernel = triton.compile(str(temp_file))
 
     kernel[(1, 1, 1)](a, x, b, y)
-
+    #import pdb; pdb.set_trace()
     assert torch.equal(a, x) and torch.equal(b.T if transpose else b, y)
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -60,6 +60,7 @@ class XPUOptions:
     generate_native_code: bool = False
     advanced_path: bool = False
     one_matrix_per_load_for_bt: bool = False
+    enable_tile_load_linear_layout: bool = True
 
     def __post_init__(self):
         default_libdir = Path(__file__).parent / 'lib'
@@ -187,6 +188,7 @@ def parse_target(self, tgt_prop) -> dict:
     def parse_options(self, opts) -> Any:
         args = {k: opts[k] for k in XPUOptions.__dataclass_fields__.keys() if k in opts}
         args["allow_fp8e4nv"] = True
+        args["enable_tile_load_linear_layout"] = os.getenv("TRITON_XPU_ENABLE_TILE_LOAD_LINEAR_LAYOUT", "1") == "1"
         return XPUOptions(**args)
 
     def pack_metadata(self, metadata):
@@ -344,7 +346,8 @@ def make_llir(src, metadata, options):
         # being used, e.g., convert_layout.
         if os.getenv("TRITON_INTEL_REDUCE_TRANSPOSE", "0") != "1":
             passes.ttgpuir.add_allocate_shared_memory(pm)
-        intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt)
+        intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt,
+                                           options.enable_tile_load_linear_layout)
         intel.passes.ttgpuir.add_rewrite_stack_ptr(pm)
         passes.convert.add_arith_to_llvmir(pm)
         passes.common.add_canonicalizer(pm)

diff --git a/third_party/intel/include/TritonIntelGPUToLLVM/Passes.td b/third_party/intel/include/TritonIntelGPUToLLVM/Passes.td
@@ -22,6 +22,9 @@ def ConvertTritonIntelGPUToLLVM
     Option<"oneMatrixPerLoadForBT", "one_matrix_per_load_for_bt",
            "bool", /*default*/"false",
            "Only load one DPAS operands per load for transposed B matrix">,
+    Option<"useTileLoadLinearLayout", "use_tile_load_linear_layout",
+           "bool", /*default*/"true",
+           "Use linear layouts to generate the tile load sizes and offsets">
   ];
 }