
Move the tensor descriptor block size check. #7325


Closed

7 changes: 7 additions & 0 deletions lib/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.cpp
@@ -253,6 +253,13 @@ LogicalResult createTMADesc(Value tmaPtr, MakeTensorDescOp op,
getTMABlockShape(encoding, shapePerCTA, /*packedSize=*/false);
auto contigDimSize = blockShape.back();

if (contigDimSize * elemSize < 16) {
return op->emitError("Descriptor block shape must have at least 16 bytes "
"in the last dimension, but got ")
<< contigDimSize << " * " << elemSize << " = "
<< (contigDimSize * elemSize) << " bytes";
}
Comment on lines +256 to +261 (Collaborator):
We want this to be a front-end error, as it will then be cleanly reported to the user rather than surfacing as a failure in the backend. Also, in terms of portability, we should ideally keep the same restrictions independently of the backend.


llvm::SmallVector<Value> boxDim;
if (fp4Padded && contigDimSize != 128) {
return op->emitError(
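For illustration, here is a minimal standalone sketch of the kind of kernel the relocated check now rejects, modeled on the updated test in test_core.py below. The allocator setup, the device string, and names such as `too_small_block_kernel` are assumptions made to keep the example self-contained; they are not part of this PR.

```python
import torch
import triton
import triton.language as tl


# Device-side tensor descriptors need a host-side scratch allocator; this
# setup is an assumption about the surrounding harness, not part of the PR.
def alloc_fn(size: int, alignment: int, stream):
    return torch.empty(size, dtype=torch.int8, device="cuda")


triton.set_allocator(alloc_fn)


@triton.jit
def too_small_block_kernel(ptr):
    # [1, 2] blocks over int32 give 2 * 4 = 8 bytes in the contiguous
    # dimension, below the 16-byte floor the relocated check enforces.
    desc = tl.make_tensor_descriptor(ptr, [128, 128], [128, 1], [1, 2])
    x = desc.load([0, 0])
    desc.store([0, 0], x)


x = torch.empty((128, 128), dtype=torch.int32, device="cuda")
try:
    too_small_block_kernel[(1, )](x)
except RuntimeError:
    # Per the updated test, the "at least 16 bytes" message is emitted on
    # stderr by the compiler diagnostics rather than attached to the exception.
    print("descriptor block shape rejected by the backend check")
```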
30 changes: 7 additions & 23 deletions python/test/unit/language/test_core.py
@@ -46,7 +46,6 @@
torch_dtype_name,
to_numpy,
)
from triton.runtime.errors import InterpreterError


@contextlib.contextmanager
@@ -5184,35 +5183,20 @@ def kernel():


@pytest.mark.interpreter
def test_tma_load_block_shape_err(device):
def test_tma_block_shape_err(capfd, device):

@triton.jit
def kernel(ptr):
desc = tl.make_tensor_descriptor(ptr, [128, 128], [128, 1], [1, 2])
desc.load([0, 0])
x = desc.load([0, 0])
x = x + x
desc.store([0, 0], x)

input = torch.empty((128, 128), dtype=torch.int32, device=device)
errc = triton.CompilationError if not is_interpreter() else InterpreterError
with pytest.raises(errc) as e:
with pytest.raises(RuntimeError) as e:
kernel[(1, )](input)

assert "Descriptor block shape must have at least 16 bytes" in str(e.value.__cause__)


@pytest.mark.interpreter
def test_tma_store_block_shape_err(device):

@triton.jit
def kernel(ptr):
desc = tl.make_tensor_descriptor(ptr, [128, 128], [128, 1], [8, 4])
desc.store([0, 0], tl.zeros([8, 4], dtype=tl.int16))

input = torch.empty((128, 128), dtype=torch.int16, device=device)
errc = triton.CompilationError if not is_interpreter() else InterpreterError
with pytest.raises(errc) as e:
kernel[(1, )](input)

assert "Descriptor block shape must have at least 16 bytes" in str(e.value.__cause__)
_, stderr = capfd.readouterr()
assert "Descriptor block shape must have at least 16 bytes" in stderr


def test_trans_reshape(device, with_allocator):
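The 16-byte floor translates into a per-dtype minimum for the last block dimension, which is why the test's `[1, 2]` block over `int32` (8 bytes) trips the check, while a last dimension of 4 int32 elements (16 bytes) satisfies this particular constraint. A small illustrative helper mirroring that arithmetic; the function name is hypothetical and not part of Triton's API:

```python
import torch


def min_last_block_dim(dtype: torch.dtype) -> int:
    """Smallest last-dimension block size satisfying the 16-byte rule.

    Hypothetical helper for illustration; it mirrors the arithmetic of the
    moved check: contig_dim_size * elem_size >= 16.
    """
    elem_size = torch.tensor([], dtype=dtype).element_size()
    return (16 + elem_size - 1) // elem_size  # ceil(16 / elem_size)


assert min_last_block_dim(torch.int32) == 4    # 4 * 4 bytes  = 16
assert min_last_block_dim(torch.float16) == 8  # 8 * 2 bytes  = 16
assert min_last_block_dim(torch.int8) == 16    # 16 * 1 byte  = 16
```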
7 changes: 0 additions & 7 deletions python/triton/language/semantic.py
@@ -1858,13 +1858,6 @@ def make_tensor_descriptor(
raise ValueError(f"Expected {ndim} strides but got {len(strides)}")
if len(block_shape) != ndim:
raise ValueError(f"Expected block_shape to have {ndim} dimensions but got {len(strides)}")
assert isinstance(base.dtype, tl.pointer_type)
elem_size = base.dtype.element_ty.primitive_bitwidth // 8
contig_dim_size = tl._unwrap_if_constexpr(block_shape[-1])
if contig_dim_size * elem_size < 16:
raise ValueError(
f"Descriptor block shape must have at least 16 bytes in the last dimension, but got {contig_dim_size} * {elem_size} = {contig_dim_size * elem_size} bytes"
)

strides[-1] = tl._unwrap_if_constexpr(strides[-1])
if strides[-1] != 1:
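With the Python-level validation above removed, callers who still want an early, pre-compilation error can replicate the deleted logic themselves. Below is a sketch that reassembles the removed check as a standalone function, kept purely for reference; the name `validate_descriptor_block_shape` and where one would call it are illustrative assumptions, not Triton API.

```python
import triton.language as tl


def validate_descriptor_block_shape(base, block_shape):
    """Standalone restatement of the check deleted from semantic.py.

    Illustrative only: `base` is expected to be a tensor of pointer type,
    as in make_tensor_descriptor; whether and where to call this is left open.
    """
    assert isinstance(base.dtype, tl.pointer_type)
    elem_size = base.dtype.element_ty.primitive_bitwidth // 8
    contig_dim_size = tl._unwrap_if_constexpr(block_shape[-1])
    if contig_dim_size * elem_size < 16:
        raise ValueError(
            f"Descriptor block shape must have at least 16 bytes in the last "
            f"dimension, but got {contig_dim_size} * {elem_size} = "
            f"{contig_dim_size * elem_size} bytes")
```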