
Commit 2c56c1e

Rebased to use the Sep 24 g++ host compilation fix.

1 parent 1cac8dd · commit 2c56c1e
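The change is mechanical: every call into the syclcompat:: namespace is renamed to compat::, with call signatures left unchanged. Purely as an illustration (this shim is not part of the commit; the <syclcompat.hpp> header path and the alias below are assumptions), a downstream build that still only ships syclcompat could satisfy the new spelling with a namespace alias:

// Hypothetical bridging shim -- NOT part of this commit. Assumes the old
// syclcompat header is available as <syclcompat.hpp> (DPC++'s layout).
#if __has_include(<syclcompat.hpp>)
  #include <syclcompat.hpp>
  namespace compat = syclcompat;  // lets compat:: call sites resolve
#endif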

File tree

8 files changed (+28, -27 lines)

applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp

Lines changed: 2 additions & 2 deletions
@@ -240,7 +240,7 @@ struct FlashPrefillMma<gemm::MainloopIntelXeXMX16<Stages>, ProblemShapeType_,
     TiledMmaQK tiled_mma;
     // To make all threads in a warp have the same global tensors pass in the
     // index of thread 0 in each warp
-    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto sg = compat::get_nd_item<1>().get_sub_group();
     auto first_thread_in_sg_idx =
         sg.get_group_id()[0] * DispatchPolicy::SubgroupSize;
     auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx);

@@ -336,7 +336,7 @@ struct FlashPrefillMma<gemm::MainloopIntelXeXMX16<Stages>, ProblemShapeType_,
     // Register spill
     Tensor gV_ = take<0, 3>(
         local_tile(gV, select<1, 2>(TileShapePV{}), make_coord(_, _)));
-    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto sg = compat::get_nd_item<1>().get_sub_group();
     auto first_thread_in_sg_idx =
         sg.get_group_id()[0] * DispatchPolicy::SubgroupSize;
     auto thread_mma = tiled_mma.get_slice(first_thread_in_sg_idx);
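For context, the idiom both hunks touch, with explanatory comments added here (a sketch; it assumes compat::get_nd_item mirrors the syclcompat API and returns the current sycl::nd_item):

// Every lane derives the linear id of lane 0 of its own subgroup, so all
// lanes agree on one index and therefore slice the same global tile.
auto sg = compat::get_nd_item<1>().get_sub_group();
auto first_thread_in_sg_idx =
    sg.get_group_id()[0]             // this subgroup's index in the work-group
    * DispatchPolicy::SubgroupSize;  // times lanes per subgroup
auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx);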

applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp

Lines changed: 2 additions & 2 deletions
@@ -195,7 +195,7 @@ class FlashPrefillEpilogue<epilogue::IntelXeXMX16, MMAOperation_,
     constexpr int FragsM = shape<1>(FragOutLayout{});
     constexpr int FragsN = size(select<2, 3>(shape(FragOutLayout{})));

-    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    auto g = compat::get_nd_item<1>().get_sub_group();
     auto out_reg = make_tensor(static_cast<decltype(out) &&>(out).data(),
                                Shape<Int<Vec>, Int<FragsM>, Int<FragsN>>{});
     float tLSE_reg = {-INFINITY};

@@ -260,7 +260,7 @@ class FlashPrefillEpilogue<epilogue::IntelXeXMX16, MMAOperation_,
     copy(params.xe_store_o, final_out_reg, tOgO);

     // Generating the LSE for backward training
-    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto sg = compat::get_nd_item<1>().get_sub_group();
     int lane_id = static_cast<int>(sg.get_local_linear_id());
     int sub_group_id = get_sub_group_id();
     const int BLK_M = size(select<0>(TileShapeOutput{}));
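The "LSE for backward training" in the second hunk is the per-row log-sum-exp statistic that flash attention stores so the backward pass can reconstruct the softmax without redoing the row reduction. A host-side reference for what the statistic is (a sketch for illustration only; lse_row is a name invented here, not the kernel's code):

#include <cmath>
#include <vector>

// Numerically stable log-sum-exp of one row of attention scores:
// lse = m + log(sum_j exp(s_j - m)), where m = max_j s_j.
float lse_row(const std::vector<float> &s) {
  float m = -INFINITY;
  for (float x : s) m = std::fmax(m, x);
  float acc = 0.0f;
  for (float x : s) acc += std::exp(x - m);
  return m + std::log(acc);
}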

applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp

Lines changed: 3 additions & 3 deletions
@@ -106,7 +106,7 @@ class FlashPrefillSoftmaxEpilogue<CausalMask_, epilogue::IntelXeXMX16,
             class FragSum>
   CUTLASS_DEVICE void scale_exp_log2(FragAcc &frag_s, FragMax const &max,
                                      FragSum &sum) {
-    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    auto g = compat::get_nd_item<1>().get_sub_group();
     const auto max_scale = max * params.scale;
     CUTLASS_PRAGMA_UNROLL
     for (int indx = 0; indx < Vec * FragsM; indx++) {

@@ -123,7 +123,7 @@ class FlashPrefillSoftmaxEpilogue<CausalMask_, epilogue::IntelXeXMX16,

   template <int Vec, int FragsM, int FragsN, class FragSrc, class FragMax>
   CUTLASS_DEVICE void reduce_max(FragSrc &src, FragMax &max) {
-    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    auto g = compat::get_nd_item<1>().get_sub_group();
     CUTLASS_PRAGMA_UNROLL
     for (int indx = 0; indx < Vec * FragsM; indx++) {
       auto maxptr = group_broadcast(g, max, indx);

@@ -155,7 +155,7 @@ class FlashPrefillSoftmaxEpilogue<CausalMask_, epilogue::IntelXeXMX16,
            " No. of attention rows per subgroup should be >= 1 MMA Atom "
            "worth of rows.");
     if (!is_first) {
-      auto g = syclcompat::get_nd_item<1>().get_sub_group();
+      auto g = compat::get_nd_item<1>().get_sub_group();
       Element max_scale{max * params.scale};
       Element exp_scale{
           sycl::native::exp2(max_prev * params.scale - max_scale)};
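All three hunks only change how the subgroup handle is obtained; the subgroup collectives themselves (group_broadcast, and the reductions behind reduce_max) are standard SYCL 2020 and untouched. A standalone sketch of those primitives, independent of the compat layer (every name here is illustrative):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  float *data = sycl::malloc_shared<float>(32, q);
  for (int i = 0; i < 32; ++i) data[i] = float(i);
  q.parallel_for(sycl::nd_range<1>{32, 32}, [=](sycl::nd_item<1> it) {
     auto sg = it.get_sub_group();
     float v = data[it.get_global_id(0)];
     // Subgroup-wide max, then a broadcast from lane 0 -- the same building
     // blocks the reduce_max()/group_broadcast calls above are built on.
     float m = sycl::reduce_over_group(sg, v, sycl::maximum<float>());
     data[it.get_global_id(0)] = sycl::group_broadcast(sg, m, 0) - v;
   }).wait();
  sycl::free(data, q);
}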

applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp

Lines changed: 1 addition & 1 deletion
@@ -190,7 +190,7 @@ struct XeFlashPersistentTileScheduler {
   }

   template <int Num_SGs> static dim3 get_grid_shape(Params const &params) {
-    auto queue = syclcompat::get_default_queue();
+    auto queue = compat::get_default_queue();
     auto dev = queue.get_device();
     const size_t maxSubgroups =
         dev.template get_info<sycl::info::device::max_num_sub_groups>();
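get_grid_shape sizes the persistent grid from device occupancy limits; the rename only changes where the default queue comes from, not the queries. The portable SYCL device queries involved, in isolation (a sketch; the scheduler's actual grid arithmetic is not reproduced):

#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;  // stands in for compat::get_default_queue()
  auto dev = q.get_device();
  const size_t max_subgroups =
      dev.get_info<sycl::info::device::max_num_sub_groups>();
  const size_t compute_units =
      dev.get_info<sycl::info::device::max_compute_units>();
  std::printf("compute units: %zu, max subgroups per work-group: %zu\n",
              compute_units, max_subgroups);
}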
Lines changed: 19 additions & 19 deletions
@@ -197,7 +197,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {

   template <typename SrcT, typename DstT>
   void convert_fp8_to_fp16(const SrcT *d_src, DstT *d_dst, size_t size) {
-    syclcompat::get_default_queue()
+    compat::get_default_queue()
         .parallel_for(
             size,
             [=](auto indx) { d_dst[indx] = static_cast<DstT>(d_src[indx]); })
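convert_fp8_to_fp16 is a plain element-wise cast kernel launched on the compat layer's default queue. The same helper in plain SYCL, for reference (a sketch; convert_on_device is a name invented here):

#include <sycl/sycl.hpp>

// Cast `size` elements from SrcT to DstT on the device, then block until done.
template <typename SrcT, typename DstT>
void convert_on_device(sycl::queue &q, const SrcT *d_src, DstT *d_dst,
                       size_t size) {
  q.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> i) {
     d_dst[i] = static_cast<DstT>(d_src[i]);
   }).wait();
}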
@@ -298,9 +298,9 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
         seq_len_qo * seq_len_kv, // batch_stride_S
         seq_len_qo * seq_len_kv  // batch_stride_S
     );
-    syclcompat::wait();
+    compat::wait();
     std::vector<ElementAccumulator> host_S(block_S.size());
-    syclcompat::memcpy<ElementAccumulator>(host_S.data(), block_S.get(),
+    compat::memcpy<ElementAccumulator>(host_S.data(), block_S.get(),
                                            host_S.size());

     // delete this memory as it is no longer needed
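Note the typed copies: the calls pass host_S.size(), an element count, so compat::memcpy<T> evidently counts elements of T, like syclcompat's typed overload rather than the byte-counting C memcpy. The plain-SYCL equivalent of this wait-then-copy pattern, for comparison (a sketch; copy_to_host is a name invented here):

#include <sycl/sycl.hpp>

// q.memcpy counts bytes, so scale the element count by sizeof(T).
template <typename T>
void copy_to_host(sycl::queue &q, T *dst, const T *src, size_t n) {
  q.wait();  // ensure the producing kernel has finished
  q.memcpy(dst, src, n * sizeof(T)).wait();
}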
@@ -378,7 +378,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
     cutlass::DeviceAllocation<ElementV_> block_P;
     block_P.reset(host_P.size());

-    syclcompat::memcpy<ElementV_>(block_P.get(), host_P.data(),
+    compat::memcpy<ElementV_>(block_P.get(), host_P.data(),
                                   host_P.size());

     cutlass::TensorRef ref_P(block_P.get(),

@@ -401,12 +401,12 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
         seq_len_qo * head_size_vo // batch_stride_O
     );

-    syclcompat::wait();
+    compat::wait();
     // delete this memory as it is no longer needed
     block_P.reset();

     std::vector<ElementAccumulator> vec_acc(block_acc.size());
-    syclcompat::memcpy<ElementAccumulator>(
+    compat::memcpy<ElementAccumulator>(
         vec_acc.data(), block_acc.get(), vec_acc.size());

     // delete this memory as it is no longer needed

@@ -434,11 +434,11 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
       offset_o += seq_len_qo * num_heads_q * head_size_vo;
     } // end of batch loop

-    syclcompat::wait();
-    syclcompat::memcpy<ElementOutput>(block_ref_O.get(), host_O.data(),
+    compat::wait();
+    compat::memcpy<ElementOutput>(block_ref_O.get(), host_O.data(),
                                       host_O.size());
-    syclcompat::wait();
-    syclcompat::memcpy<float>(block_ref_LSE.get(), host_LSE.data(),
+    compat::wait();
+    compat::memcpy<float>(block_ref_LSE.get(), host_LSE.data(),
                               host_LSE.size());

     // Check if output from CUTLASS kernel and reference kernel are equal or not

@@ -613,29 +613,29 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
     // configure smem size and carveout
     int smem_size = FMHAPrefillKernel::SharedStorageSize;

-    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    const auto sycl_block = compat::dim3(block.x, block.y, block.z);
+    const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z);

     // Launch parameters depend on whether SYCL compiler supports work-group scratch
     // memory extension
 #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
-    using namespace syclcompat::experimental;
+    using namespace compat::experimental;
     auto event = launch<cutlass::device_kernel<FMHAPrefillKernel>>(
         launch_policy{sycl_grid, sycl_block,
                       local_mem_size{static_cast<std::size_t>(smem_size)},
                       kernel_properties{sycl_exp::sub_group_size<
                           FMHAPrefillKernel::DispatchPolicy::SubgroupSize>}},
         params);
 #else
-    syclcompat::experimental::launch_properties launch_props{
+    compat::experimental::launch_properties launch_props{
         sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size),
     };
-    syclcompat::experimental::kernel_properties kernel_props{
+    compat::experimental::kernel_properties kernel_props{
         sycl::ext::oneapi::experimental::sub_group_size<
             FMHAPrefillKernel::DispatchPolicy::SubgroupSize>};
-    syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block,
+    compat::experimental::launch_policy policy{sycl_grid, sycl_block,
                                                    launch_props, kernel_props};
-    auto event = syclcompat::experimental::launch<
+    auto event = compat::experimental::launch<
         cutlass::device_kernel<FMHAPrefillKernel>>(policy, params);
 #endif
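The #if above selects how the kernel receives its dynamic shared local memory: compilers without the work-group scratch extension go through the compat layer's local_mem_size launch property, while newer ones pass sycl::ext::oneapi::experimental::work_group_scratch_size directly. SYCL extensions advertise themselves via feature-test macros, so the choice is purely compile-time; a minimal probe (a sketch):

#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
#if defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
  std::puts("work-group scratch extension available: property-based path");
#else
  std::puts("extension absent: compat local_mem_size fallback path");
#endif
}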

@@ -681,7 +681,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
     // Run the GEMM
     run(params);

-    syclcompat::wait();
+    compat::wait();

     // Verify that the result is correct
     bool passed = verify(problem_size, options.is_causal, options.softmax_scale);

@@ -697,7 +697,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
     for (int i = 0; i < options.iterations; ++i) {
       run(params);
     }
-    syclcompat::wait();
+    compat::wait();
     // when seq_len_qo is not equal to seq_len_kv we use bottom up approach
     // for the masking. Following changes will adjust the effective_seq_len_kv
     // when masking applied for such cases

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ if(CUTLASS_ENABLE_SYCL)
   04_bmg_grouped_gemm
   05_bmg_gemm_with_epilogues
   06_bmg_flash_attention
+  06a_bmg_flash_attention_sdpa_fwd_bshd
   07_bmg_dual_gemm
   08_bmg_gemm_f8
   09_bmg_grouped_gemm_f8
