Commit 177c85c

Python ops support improvements and test fixes (#595)

Improvements:
- Add Xe12 and Xe20 support for C++ generation and remove hardcoded values (PVC was previously encoded as 11)
- More combinations for generator.py (covers void for ElementC)
- Python test fixes

Tests done:
- Python tests
- Compilation of generated files
- Torch generated-file test

Note about C++ file changes:
- Gemm_operation3x.hpp: cast to ElementCompute added to avoid compiler errors
- xe_epilogue.hpp: support for ElementC as a void type
1 parent 7ab29af commit 177c85c

File tree: 18 files changed, +400 −176 lines

include/cutlass/epilogue/collective/xe_epilogue.hpp

Lines changed: 13 additions & 7 deletions
@@ -120,10 +120,6 @@ class CollectiveEpilogue<
 
   using CopyThreadShape = Shape<_1, Int<SubgroupSize>>;
 
-  using Trait_C = Copy_Traits<GmemTiledCopyC, StrideC>;
-  using val_layout_load_C = decltype(make_layout(shape_div(typename Trait_C::BlockShape{}, CopyThreadShape{})));
-  using XE_Copy_C = decltype(make_tiled_copy(Copy_Atom<Trait_C, ElementC>{}, Layout<CopyThreadShape>{}, val_layout_load_C{}));
-
   using Trait_D = Copy_Traits<GmemTiledCopyD, StrideD>;
   using val_layout_store_D = decltype(make_layout(shape_div(typename Trait_D::BlockShape{}, CopyThreadShape{})));
   using XE_Copy_D = decltype(make_tiled_copy(Copy_Atom<Trait_D, ElementD>{}, Layout<CopyThreadShape>{}, val_layout_store_D{}));
@@ -132,6 +128,13 @@ class CollectiveEpilogue<
   constexpr static bool is_source_supported = not cute::is_void_v<ElementC> && not cute::is_void_v<CopyOpG2R>;
   constexpr static bool is_destination_supported = not cute::is_void_v<ElementD> && not cute::is_void_v<CopyOpR2G>;
 
+  using NonVoidElementC = conditional_t<is_source_supported, ElementC, ElementD>;
+  using Trait_C = Copy_Traits<GmemTiledCopyC, StrideC>;
+  using NonVoidTrait_C = conditional_t<is_source_supported, Trait_C, Trait_D>;
+  using val_layout_load_C = decltype(make_layout(shape_div(typename NonVoidTrait_C::BlockShape{}, CopyThreadShape{})));
+  using NonVoidValLayoutLoad_C = conditional_t<is_source_supported, val_layout_load_C, val_layout_store_D>;
+  using XE_Copy_C = decltype(make_tiled_copy(Copy_Atom<NonVoidTrait_C, NonVoidElementC>{}, Layout<CopyThreadShape>{}, NonVoidValLayoutLoad_C{}));
+
   constexpr static bool is_m_major_C = detail::is_m_major<StrideC>();
   constexpr static bool is_m_major_D = detail::is_m_major<StrideD>();
 
@@ -348,7 +351,7 @@ class CollectiveEpilogue<
     auto thread_xe_store_d = params.xe_store_d.get_thread_slice(thread_idx);
     Tensor tCgD = thread_xe_store_d.partition_D(gD);
 
-    Tensor trC = make_tensor<ElementC>(Shape<Int<FragmentSize>>{});
+    Tensor trC = make_tensor<NonVoidElementC>(Shape<Int<FragmentSize>>{});
     Tensor trD_compute = make_tensor<ElementCompute>(Shape<Int<FragmentSize>>{});
 
     // Because Sm90 uses shared memory, they are not tied to using the same accumulator values
@@ -407,9 +410,12 @@ class CollectiveEpilogue<
       CUTLASS_PRAGMA_UNROLL
      for (int epi_m = 0; epi_m < FragsM; epi_m++) {
         cst_callbacks.begin_loop(epi_m, epi_n);
-
+
+        //avoid evaluating xe_load_c when ElementC is void during compilation
         if (is_C_load_needed) {
-          copy(params.xe_load_c, tCgC(_, epi_m, epi_n), trC);
+          if constexpr (is_source_supported) {
+            copy(params.xe_load_c, tCgC(_, epi_m, epi_n), trC);
+          }
         }
 
         cst_callbacks.previsit(epi_m, epi_n, 0, is_C_load_needed);
python/cutlass_cppgen/backend/evt/passes/util.py

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@
 
 # Map from the CC of the kernel to the EVT implementation that the CC targets
 cc_map = {
+    12: 12, # Intel Xe12 PVC
+    20: 20, # Intel Xe20 BMG
     80: 80,
     86: 80,
     89: 80,
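
The two new entries make the Intel Xe CCs resolve to their own EVT implementations, while the existing SM86/SM89 entries keep falling back to SM80. A minimal sketch of the lookup, assuming the pass simply indexes cc_map with the kernel's CC (the helper name evt_target_cc is invented for illustration):

# Mapping as extended by this commit (excerpt).
cc_map = {
    12: 12,  # Intel Xe12 (PVC) has a dedicated EVT implementation
    20: 20,  # Intel Xe20 (BMG) has a dedicated EVT implementation
    80: 80,
    86: 80,  # SM86 reuses the SM80 EVT implementation
    89: 80,
}

def evt_target_cc(kernel_cc: int) -> int:
    # Hypothetical helper: resolve which EVT implementation a kernel CC targets.
    return cc_map[kernel_cc]

assert evt_target_cc(20) == 20
assert evt_target_cc(89) == 80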

python/cutlass_cppgen/backend/gemm_operation.py

Lines changed: 11 additions & 5 deletions
@@ -39,6 +39,7 @@
 cuda = lazy_import("cuda.cuda")
 cudart = lazy_import("cuda.cudart")
 from cutlass_library import SubstituteTemplate
+from cutlass_library.arch_constants import is_intel_xe_arch
 import numpy as np
 
 import dpctl
@@ -915,7 +916,7 @@ def get_device_workspace_size(self, arguments):
         return 0
 
     def initialize(self):
-        if self.operation.arch == 11:
+        if is_intel_xe_arch(self.operation.arch):
             return
 
         err, = cuda.cuFuncSetAttribute(
@@ -1318,7 +1319,7 @@ def __init__(self, operation_suffix=""):
 
     def emit(self, operation):
         # Support built-in epilogue functors or user-defined functions
-        if operation.arch == 11:
+        if is_intel_xe_arch(operation.arch):
             stage_count_type = "cutlass::gemm::collective::StageCountAuto"
         elif operation.tile_description.stages is None or operation.tile_description.stages == 0:
             stage_count_type = "cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>"
@@ -1340,7 +1341,7 @@ def emit(self, operation):
         if operation.tile_description.tile_scheduler is not None:
             tschedule = operation.tile_description.tile_scheduler
 
-        arch = "cutlass::arch::IntelXe" if operation.arch == 11 else f"cutlass::arch::Sm{operation.arch}"
+        arch = f"cutlass::arch::Xe{operation.arch}" if is_intel_xe_arch(operation.arch) else f"cutlass::arch::Sm{operation.arch}"
         values = {
             "operation_name": operation.procedural_name(),
             "operation_suffix": self.operation_suffix,
@@ -1718,10 +1719,15 @@ def epilogue_schedule_name_3x(self):
     def procedural_name(self):
         """The full procedural name indicates architecture, extended name, tile size, and layout."""
         opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
-        if self.api == ApiVersion.v3x and (self.arch >= 90 or self.arch == 11):
-            kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}{k}{e}"
+        if self.api == ApiVersion.v3x and (self.arch >= 90 or is_intel_xe_arch(self.arch)):
+            arch_prefix="sm"
+            if is_intel_xe_arch(self.arch):
+                arch_prefix="Xe"
+
+            kernel_name_template = "cutlass{p}_{sm_or_xe}{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}{k}{e}"
             return kernel_name_template.format(
                 p=self.prefix,
+                sm_or_xe=arch_prefix,
                 ar=self.arch,
                 op=opcode_class_name,
                 ex=self.extended_name_3x(),
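
With the new {sm_or_xe} placeholder, v3x kernel names carry an Xe prefix on Intel parts instead of the previous sm prefix. A hedged illustration of the template in isolation (every field value below is invented, not taken from a real kernel; a real GemmOperation fills them from its tile description, layouts, and schedules):

kernel_name_template = ("cutlass{p}_{sm_or_xe}{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}"
                        "_{cm}x{cn}x{ck}_{l}_{s}_align{al}{k}{e}")

# Invented example values, for illustration only.
print(kernel_name_template.format(
    p="3x", sm_or_xe="Xe", ar=20, op="tensorop", ex="f16_f16_f32",
    tbm=256, tbn=256, tbk=32, cm=1, cn=1, ck=1,
    l="tnn", s="auto", al=8, k="", e=""))
# -> cutlass3x_Xe20_tensorop_f16_f16_f32_256x256x32_1x1x1_tnn_auto_align8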

python/cutlass_cppgen/backend/library.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@
     OpcodeClass,
     TileSchedulerType
 )
-
+from cutlass_library.arch_constants import is_intel_xe_arch
 
 # The following block implements enum.auto() for Python 3.5 variants that don't include it such
 # as the default 3.5.2 on Ubuntu 16.04.
@@ -473,7 +473,7 @@ def api_version(arch, opclass, dtype):
     :return: API version to be used in code emission
     :rtype: ApiVersion
     """
-    if opclass == OpcodeClass.TensorOp and arch == 11:
+    if opclass == OpcodeClass.TensorOp and is_intel_xe_arch(arch):
         return ApiVersion.v3x
 
     if (arch >= 90 and

python/cutlass_cppgen/backend/utils/device.py

Lines changed: 2 additions & 2 deletions
@@ -81,8 +81,8 @@ def device_cc(device: int = -1) -> int:
         device = cutlass_cppgen.device_id()
 
     if cutlass_cppgen._use_sycl:
-        # Using '11' to encode Intel PVC as an integer in the expected format.
-        return 11
+        # Using '12' to encode Intel PVC as an integer in the expected format.
+        return 12
 
     deviceProp = check_cuda_errors(cudart.cudaGetDeviceProperties(device))
     major = str(deviceProp.major)

python/cutlass_cppgen/library_defaults.py

Lines changed: 33 additions & 9 deletions
@@ -40,14 +40,24 @@
 
 import cutlass_library
 from cutlass_library.library import ConvKind, IteratorAlgorithm, StrideSupport, GroupMode
+from cutlass_library.arch_constants import (
+    INTEL_XE_ARCH_MIN,
+    INTEL_XE_ARCH_MAX,
+    INTEL_XE12,
+    INTEL_XE20,
+    INTEL_XE35,
+    is_intel_xe_arch
+)
 
 import cutlass_cppgen
 from cutlass_cppgen.utils.check import valid_stage_count
 from cutlass_cppgen.utils.datatypes import td_from_profiler_td, td_from_profiler_op
 
 
-# The value '11' is used to encode Intel PVC GPU in the expected format.
-_generator_ccs = [11, 50, 60, 61, 70, 75, 80, 90]
+# Intel Xe architectures and supported NVIDIA architectures
+# Intel Xe: 12 (PVC/Xe-HPC), 20 (BMG/Xe2), 30 (future)
+# NVIDIA architectures: 50, 60, 61, 70, 75, 80, 90
+_generator_ccs = [INTEL_XE12, INTEL_XE20] #50, 60, 61, 70, 75, 80, 90]
 
 class KernelsForDataType:
     """
@@ -261,7 +271,12 @@ def __init__(
 
         # Identify the method within CUTLASS generator script that generates kernel
         # descriptions for the target CC
-        generate_function_name = "GeneratePVC" if kernel_cc == 11 else "GenerateSM" + str(kernel_cc)
+        # Intel Xe architectures use GenerateIntelXe, NVIDIA uses GenerateSM{cc}
+        if is_intel_xe_arch(kernel_cc):
+            generate_function_name = "GenerateIntelXe"
+        else:
+            generate_function_name = "GenerateSM" + str(kernel_cc)
+
         if not hasattr(cutlass_library.generator, generate_function_name):
             cutlass_cppgen.logger.warning(f"No generator found for architecture {kernel_cc}")
             return
@@ -273,13 +288,20 @@ def __init__(
             "--kernels=all",
             f"--log-level={logging.getLevelName(cutlass_cppgen.logger.level)}"
         ]
-        if self.cc == 11:
-            args.append("--architectures=11")
+        # For Intel Xe architectures, specify the architecture number
+        if is_intel_xe_arch(kernel_cc):
+            args.append(f"--architectures={kernel_cc}")
 
         manifest_args = cutlass_library.generator.define_parser().parse_args(args)
         manifest = cutlass_library.manifest.Manifest(manifest_args)
-        generate_function(manifest, cutlass_cppgen._nvcc_version)
-
+
+        # For Intel Xe architectures, pass the architecture number to the generator
+        if is_intel_xe_arch(kernel_cc):
+            print(f"Calling {generate_function_name} with arch={kernel_cc}")
+            generate_function(manifest, cutlass_cppgen._nvcc_version, arch=kernel_cc)
+        else:
+            generate_function(manifest, cutlass_cppgen._nvcc_version)
+
         if operation_kind not in manifest.operations:
             # No kernels generated for this architecture, this could be because the CUDA
             # toolkit is insufficient to support operations in this CC
@@ -554,8 +576,10 @@ class OptionRegistry:
     def __init__(self, target_cc: int):
         self.registry = {}
 
-        if target_cc > 90:
-            raise Exception(f"Unsupported compute capability {target_cc}. The CUTLASS Python interface only supports compute capabilities up to 90.")
+        # Intel Xe architectures: 12-20 (PVC, BMG, etc.)
+        # NVIDIA architectures: 50-90
+        if target_cc > 90 or (not is_intel_xe_arch(target_cc)):
+            raise Exception(f"Unsupported compute capability {target_cc}. Supported: NVIDIA SM 50-90, Intel Xe 12-20.")
 
         gemm_kinds = [cutlass_library.GemmKind.Universal, cutlass_library.GemmKind.Universal3x]
         operation_kinds = [cutlass_library.OperationKind.Gemm, cutlass_library.OperationKind.Conv2d]
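
The net effect of the middle two hunks: Intel Xe CCs resolve to a single GenerateIntelXe entry point that receives the CC via the arch keyword, while NVIDIA CCs keep their per-arch GenerateSM{cc} functions. A condensed sketch of that dispatch (the wrapper name run_generator is invented; the looked-up generator functions and the arch keyword are the ones the diff references):

from cutlass_library.arch_constants import is_intel_xe_arch
import cutlass_library.generator as generator

def run_generator(manifest, cuda_version, kernel_cc):
    # Invented wrapper condensing the dispatch added above.
    if is_intel_xe_arch(kernel_cc):
        # One GenerateIntelXe entry point serves all Xe CCs; the CC selects the target.
        generator.GenerateIntelXe(manifest, cuda_version, arch=kernel_cc)
    else:
        getattr(generator, f"GenerateSM{kernel_cc}")(manifest, cuda_version)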

python/cutlass_cppgen/utils/check.py

Lines changed: 13 additions & 6 deletions
@@ -37,7 +37,14 @@
 import ctypes
 
 from cutlass_library import DataTypeSize, OperationKind, SharedMemPerCC
-
+from cutlass_library.arch_constants import (
+    INTEL_XE_ARCH_MIN,
+    INTEL_XE_ARCH_MAX,
+    INTEL_XE12,
+    INTEL_XE20,
+    INTEL_XE35,
+    is_intel_xe_arch
+)
 import cutlass_cppgen
 from cutlass_cppgen.backend.library import TileDescription
 
@@ -117,16 +124,16 @@ def valid_stage_count(
         "result in compilation errors if the combination of tile shape, "
         "stage count, and shared memory requirement of the epilogue exceeds "
         "the available shared memory per SM.")
-
-    if kernel_cc == 11:
+    print(f"KernelCC: {kernel_cc}")
+    if is_intel_xe_arch(kernel_cc):
         if (td.stages is None or td.stages == 0):
-            # Support for Intel PVC GPU currently does not allow explicit
+            # Support for Intel Xe GPUs currently does not allow explicit
             # specification of the stage count. With None or 0, the
             # CollectiveBuilder automatically determines the stage count to use.
             return (True, "")
         elif verbose:
-            cutlass.logger.warning(
-                "Setting an explicit stage count for Intel PVC GPU is currently "
+            cutlass_cppgen.logger.warning(
+                "Setting an explicit stage count for Intel Xe GPUs is currently "
                 "not supported.")
 
     if td.stages <= 0:

python/cutlass_library/arch_constants.py

Lines changed: 38 additions & 0 deletions
@@ -45,3 +45,41 @@
 CUDA_ARCH_MIN = 50 # Minimum CUDA architecture (sm_50, sm_60, etc.)
 
 ###################################################################################################
+# Specific Intel Xe architecture constants
+###################################################################################################
+# Intel Xe12 - PVC (Ponte Vecchio) HPC architecture
+INTEL_XE12 = 12
+
+# Intel Xe20 - BMG (Battlemage) gaming architecture
+INTEL_XE20 = 20
+
+# Intel Xe35 - Future architecture placeholder
+INTEL_XE35 = 35
+
+###################################################################################################
+# Architecture validation helpers
+###################################################################################################
+def is_intel_xe_arch(arch):
+    """Check if the given architecture is an Intel Xe architecture."""
+    return INTEL_XE_ARCH_MIN <= arch < INTEL_XE_ARCH_MAX
+
+def is_cuda_arch(arch):
+    """Check if the given architecture is a CUDA architecture."""
+    return arch >= CUDA_ARCH_MIN
+
+def get_arch_name(arch):
+    """Get a human-readable name for the architecture."""
+    if arch == INTEL_XE12:
+        return "Intel Xe12 (PVC)"
+    elif arch == INTEL_XE20:
+        return "Intel Xe20 (BMG)"
+    elif arch == INTEL_XE35:
+        return "Intel Xe35 (CRI)"
+    elif is_intel_xe_arch(arch):
+        return f"Intel Xe{arch}"
+    elif is_cuda_arch(arch):
+        return f"CUDA SM{arch}"
+    else:
+        return f"Unknown({arch})"
+
+###################################################################################################
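
A quick usage sketch of the new helpers. It assumes INTEL_XE_ARCH_MIN and INTEL_XE_ARCH_MAX, defined earlier in this module but not shown in the diff, bound the Xe range so that 12 and 20 fall inside it and 80 does not:

from cutlass_library.arch_constants import is_intel_xe_arch, is_cuda_arch, get_arch_name

for cc in (12, 20, 80):
    print(cc, is_intel_xe_arch(cc), is_cuda_arch(cc), get_arch_name(cc))
# Expected, given the assumed bounds:
# 12 True False Intel Xe12 (PVC)
# 20 True False Intel Xe20 (BMG)
# 80 False True CUDA SM80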

python/cutlass_library/gemm_operation.py

Lines changed: 50 additions & 12 deletions
@@ -48,10 +48,16 @@
     if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
         raise ImportError("Disabling attempt to import cutlass_library")
     from cutlass_library.library import *
-    from cutlass_library.arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN
+    from cutlass_library.arch_constants import (
+        INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN,
+        INTEL_XE12, INTEL_XE20, INTEL_XE35
+    )
 except ImportError:
     from library import *
-    from arch_constants import INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN
+    from arch_constants import (
+        INTEL_XE_ARCH_MIN, INTEL_XE_ARCH_MAX, CUDA_ARCH_MIN,
+        INTEL_XE12, INTEL_XE20, INTEL_XE35
+    )
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -392,16 +398,48 @@ def _procedural_name(self):
                 l = self.layout_name(),
                 a = str(max(self.A.alignment, self.B.alignment)))
         else:
-            # Intel Xe architectures use xe{cc} naming (e.g., xe20 for BMG, xe12 for PVC)
-            threadblock = self.tile_description.procedural_name()
-            return "cutlass{p}_xe{ar}_{op}_{ex}_{tb}_{l}_align{a}".format(
-                p = self.prefix,
-                ar = self.arch,
-                op = opcode_class_name,
-                ex = self.extended_name(),
-                tb = threadblock,
-                l = self.layout_name(),
-                a = str(max(self.A.alignment, self.B.alignment)))
+            # Intel Xe architectures use xe{cc} naming with similar detail level as NVIDIA
+            # Format: cutlass{p}_xe{ar}_{op}_{ex}{ct}{cs}_{l}_{s}_align{al}{t}{k}{e}
+            if self.is_3x:
+                # Use 3x naming convention with full details like NVIDIA SM90+
+                tile_shape = self.get_collective_tile_shape()
+                extended = self.extended_name_3x()
+
+                # Add D type suffix if different from C type to distinguish mixed precision variants
+                if self.D.element != self.C.element:
+                    extended += f"_d{DataTypeNames[self.D.element]}"
+
+                kernel_name_template = "cutlass{p}_xe{ar}_{op}_{ex}{ct}{cs}_{l}_{s}_align{al}{t}{k}{e}"
+                return kernel_name_template.format(
+                    p = self.prefix,
+                    ar = self.arch,
+                    op = opcode_class_name,
+                    ex = extended,
+                    ct = '_' + 'x'.join([str(i) for i in tile_shape]) if tile_shape[0] > 0 else "",
+                    cs = '_' + 'x'.join([str(i) for i in self.tile_description.cluster_shape]),
+                    l = self.tile_description.stages,
+                    s = self.layout_name_3x(),
+                    al = str(max(self.A.alignment, self.B.alignment)),
+                    t = TileSchedulerSuffixes[self.tile_scheduler],
+                    k = self.kernel_schedule_name_3x(),
+                    e = self.epilogue_schedule_name_3x())
+            else:
+                # Legacy naming for non-3x Intel Xe operations
+                threadblock = self.tile_description.procedural_name()
+                extended = self.extended_name()
+
+                # Add D type suffix if different from C type to distinguish mixed precision variants
+                if self.D.element != self.C.element:
+                    extended += f"_d{DataTypeNames[self.D.element]}"
+
+                return "cutlass{p}_xe{ar}_{op}_{ex}_{tb}_{l}_align{a}".format(
+                    p = self.prefix,
+                    ar = self.arch,
+                    op = opcode_class_name,
+                    ex = extended,
+                    tb = threadblock,
+                    l = self.layout_name(),
+                    a = str(max(self.A.alignment, self.B.alignment)))
 
     #
     def configuration_name(self):
