@@ -197,7 +197,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
197197
198198 template <typename SrcT, typename DstT>
199199 void convert_fp8_to_fp16 (const SrcT *d_src, DstT *d_dst, size_t size) {
200- syclcompat ::get_default_queue ()
200+ compat ::get_default_queue ()
201201 .parallel_for (
202202 size,
203203 [=](auto indx) { d_dst[indx] = static_cast <DstT>(d_src[indx]); })
@@ -298,9 +298,9 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
298298 seq_len_qo * seq_len_kv, // batch_stride_S
299299 seq_len_qo * seq_len_kv // batch_stride_S
300300 );
301- syclcompat ::wait ();
301+ compat ::wait ();
302302 std::vector<ElementAccumulator> host_S (block_S.size ());
303- syclcompat ::memcpy<ElementAccumulator>(host_S.data (), block_S.get (),
303+ compat ::memcpy<ElementAccumulator>(host_S.data (), block_S.get (),
304304 host_S.size ());
305305
306306 // delete this memory as it is no longer needed
@@ -378,7 +378,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
378378 cutlass::DeviceAllocation<ElementV_> block_P;
379379 block_P.reset (host_P.size ());
380380
381- syclcompat ::memcpy<ElementV_>(block_P.get (), host_P.data (),
381+ compat ::memcpy<ElementV_>(block_P.get (), host_P.data (),
382382 host_P.size ());
383383
384384 cutlass::TensorRef ref_P (block_P.get (),
@@ -401,12 +401,12 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
401401 seq_len_qo * head_size_vo // batch_stride_O
402402 );
403403
404- syclcompat ::wait ();
404+ compat ::wait ();
405405 // delete this memory as it is no longer needed
406406 block_P.reset ();
407407
408408 std::vector<ElementAccumulator> vec_acc (block_acc.size ());
409- syclcompat ::memcpy<ElementAccumulator>(
409+ compat ::memcpy<ElementAccumulator>(
410410 vec_acc.data (), block_acc.get (), vec_acc.size ());
411411
412412 // delete this memory as it is no longer needed
@@ -434,11 +434,11 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
434434 offset_o += seq_len_qo * num_heads_q * head_size_vo;
435435 } // end of batch loop
436436
437- syclcompat ::wait ();
438- syclcompat ::memcpy<ElementOutput>(block_ref_O.get (), host_O.data (),
437+ compat ::wait ();
438+ compat ::memcpy<ElementOutput>(block_ref_O.get (), host_O.data (),
439439 host_O.size ());
440- syclcompat ::wait ();
441- syclcompat ::memcpy<float >(block_ref_LSE.get (), host_LSE.data (),
440+ compat ::wait ();
441+ compat ::memcpy<float >(block_ref_LSE.get (), host_LSE.data (),
442442 host_LSE.size ());
443443
444444 // Check if output from CUTLASS kernel and reference kernel are equal or not
@@ -613,29 +613,29 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
613613 // configure smem size and carveout
614614 int smem_size = FMHAPrefillKernel::SharedStorageSize;
615615
616- const auto sycl_block = syclcompat ::dim3 (block.x , block.y , block.z );
617- const auto sycl_grid = syclcompat ::dim3 (grid.x , grid.y , grid.z );
616+ const auto sycl_block = compat ::dim3 (block.x , block.y , block.z );
617+ const auto sycl_grid = compat ::dim3 (grid.x , grid.y , grid.z );
618618
619619// Launch parameters depend on whether SYCL compiler supports work-group scratch
620620// memory extension
621621#if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
622- using namespace syclcompat ::experimental;
622+ using namespace compat ::experimental;
623623 auto event = launch<cutlass::device_kernel<FMHAPrefillKernel>>(
624624 launch_policy{sycl_grid, sycl_block,
625625 local_mem_size{static_cast <std::size_t >(smem_size)},
626626 kernel_properties{sycl_exp::sub_group_size<
627627 FMHAPrefillKernel::DispatchPolicy::SubgroupSize>}},
628628 params);
629629#else
630- syclcompat ::experimental::launch_properties launch_props{
630+ compat ::experimental::launch_properties launch_props{
631631 sycl::ext::oneapi::experimental::work_group_scratch_size (smem_size),
632632 };
633- syclcompat ::experimental::kernel_properties kernel_props{
633+ compat ::experimental::kernel_properties kernel_props{
634634 sycl::ext::oneapi::experimental::sub_group_size<
635635 FMHAPrefillKernel::DispatchPolicy::SubgroupSize>};
636- syclcompat ::experimental::launch_policy policy{sycl_grid, sycl_block,
636+ compat ::experimental::launch_policy policy{sycl_grid, sycl_block,
637637 launch_props, kernel_props};
638- auto event = syclcompat ::experimental::launch<
638+ auto event = compat ::experimental::launch<
639639 cutlass::device_kernel<FMHAPrefillKernel>>(policy, params);
640640#endif
641641
@@ -681,7 +681,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
681681 // Run the GEMM
682682 run (params);
683683
684- syclcompat ::wait ();
684+ compat ::wait ();
685685
686686 // Verify that the result is correct
687687 bool passed = verify (problem_size, options.is_causal , options.softmax_scale );
@@ -697,7 +697,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
697697 for (int i = 0 ; i < options.iterations ; ++i) {
698698 run (params);
699699 }
700- syclcompat ::wait ();
700+ compat ::wait ();
701701 // when seq_len_qo is not equal to seq_len_kv we use bottom up approach
702702 // for the masking. Following changes will adjust the effective_seq_len_kv
703703 // when masking applied for such cases
0 commit comments