diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml
index aa751bcf4158..42137b8de70a 100644
--- a/.github/workflows/sycl-rel-nightly.yml
+++ b/.github/workflows/sycl-rel-nightly.yml
@@ -32,6 +32,11 @@ jobs:
             image_options: -u 1001 --device=/dev/dri --device=/dev/kfd
             target_devices: hip:gpu
 
+          - name: NVIDIA/CUDA
+            runner: '["Linux", "cuda"]'  # JSON-encoded list of runner labels, kept as a string
+            image_options: -u 1001 --gpus all --cap-add SYS_ADMIN  # container options; no /dev/dri mapping here
+            target_devices: cuda:gpu  # CUDA backend, GPU device
+
           - name: Intel L0 Gen12 GPU
             runner: '["Linux", "gen12"]'
             image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
@@ -43,6 +48,11 @@ jobs:
             image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
             target_devices: level_zero:gpu
 
+          - name: Intel L0 Arc A-Series GPU
+            runner: '["Linux", "arc"]'  # JSON-encoded list of runner labels, kept as a string
+            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+            target_devices: level_zero:gpu  # Level Zero backend, GPU device
+
           - name: Intel OCL Gen12 GPU
             runner: '["Linux", "gen12"]'
             image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
@@ -102,40 +112,6 @@ jobs:
       extra_lit_opts: ${{ matrix.extra_lit_opts }}
       repo_ref: ${{ github.sha  }}
 
-  cuda-aws-start:
-    needs: [ubuntu2204_build]
-    if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }}
-    uses: ./.github/workflows/sycl-aws.yml
-    secrets: inherit
-    with:
-      mode: start
-      ref: ${{ github.sha  }}
-
-  cuda-run-tests:
-    needs: [ubuntu2204_build, cuda-aws-start]
-    if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }}
-    uses: ./.github/workflows/sycl-linux-run-tests.yml
-    with:
-      name: CUDA E2E
-      runner: '["aws_cuda-${{ github.run_id }}-${{ github.run_attempt }}"]'
-      image: ghcr.io/intel/llvm/ubuntu2204_build:latest-133fee559371ce0e6ff867e378c21cde2bdf6c90
-      image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1
-      target_devices: cuda:gpu
-      repo_ref: ${{ github.sha  }}
-
-      sycl_toolchain_artifact: sycl_linux_default
-      sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
-      sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}
-
-  cuda-aws-stop:
-    needs: [cuda-aws-start, cuda-run-tests]
-    if: always() && ${{ needs.cuda-aws-start.result != 'skipped' }}
-    uses: ./.github/workflows/sycl-aws.yml
-    secrets: inherit
-    with:
-      mode: stop
-      ref: ${{ github.sha  }}
-
   build-sycl-cts:
     needs: ubuntu2204_build
     if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }}
diff --git a/sycl/test-e2e/ESIMD/group_barrier.cpp b/sycl/test-e2e/ESIMD/group_barrier.cpp
index 7f9164403229..6a4613ef9f53 100644
--- a/sycl/test-e2e/ESIMD/group_barrier.cpp
+++ b/sycl/test-e2e/ESIMD/group_barrier.cpp
@@ -14,40 +14,57 @@
 #include "esimd_test_utils.hpp"
 #include <sycl/ext/oneapi/experimental/root_group.hpp>
 #include <sycl/group_barrier.hpp>
+#include <sycl/kernel_bundle.hpp>
 
-static constexpr int WorkGroupSize = 16;
+namespace syclex = sycl::ext::oneapi::experimental;
+
+static constexpr int WorkGroupSize = 32;
 
 static constexpr int VL = 16;
+
+template <int Val> class MyKernel;
+
 template <bool UseThisWorkItemAPI> bool test(sycl::queue &q) {
   bool Pass = true;
-  const auto MaxWGs = 8;
-  size_t WorkItemCount = MaxWGs * WorkGroupSize * VL;
   std::cout << "Test case UseThisWorkItemAPI="
             << std::to_string(UseThisWorkItemAPI) << std::endl;
   const auto Props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
-  sycl::buffer<int> DataBuf{sycl::range{WorkItemCount}};
-  const auto Range = sycl::nd_range<1>{MaxWGs * WorkGroupSize, WorkGroupSize};
+  auto Bundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
+  auto Kernel = Bundle.template get_kernel<MyKernel<UseThisWorkItemAPI>>();
+  sycl::range<3> LocalRange{WorkGroupSize, 1, 1};
+  auto MaxWGs = Kernel.template ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(q, LocalRange,
+                                                                0);
+  // Scale dimension 0 by the queried work-group limit so the root-group
+  // barrier synchronizes several work-groups instead of a single one.
+  auto GlobalRange = LocalRange;
+  GlobalRange[0] *= MaxWGs;
+  size_t WorkItemCount = GlobalRange.size() * VL;
+  sycl::buffer<int> DataBuf{WorkItemCount};
+  const auto Range = sycl::nd_range<3>{GlobalRange, LocalRange};
   q.submit([&](sycl::handler &h) {
      sycl::accessor Data{DataBuf, h};
-     h.parallel_for(Range, Props, [=](sycl::nd_item<1> it) SYCL_ESIMD_KERNEL {
-       int ID = it.get_global_linear_id();
-       __ESIMD_NS::simd<int, VL> V(ID, 1);
-       // Write data to another kernel's data to verify the barrier works.
-       __ESIMD_NS::block_store(
-           Data, (WorkItemCount * sizeof(int)) - (ID * sizeof(int) * VL), V);
-       if constexpr (UseThisWorkItemAPI) {
-         auto Root =
-             sycl::ext::oneapi::experimental::this_work_item::get_root_group<
-                 1>();
-         sycl::group_barrier(Root);
-       } else {
-         auto Root = it.ext_oneapi_get_root_group();
-         sycl::group_barrier(Root);
-       }
-       __ESIMD_NS::simd<int, VL> VOther(ID * VL, 1);
-       __ESIMD_NS::block_store(Data, ID * sizeof(int) * VL, VOther);
-     });
+     h.parallel_for<MyKernel<UseThisWorkItemAPI>>(
+         Range, Props, [=](sycl::nd_item<3> it) SYCL_ESIMD_KERNEL {
+           int ID = it.get_global_linear_id();
+           __ESIMD_NS::simd<int, VL> V(ID, 1);
+           // Write into another work-item's slot to verify the barrier works.
+           __ESIMD_NS::block_store(
+               Data, (WorkItemCount * sizeof(int)) - (ID * sizeof(int) * VL),
+               V);
+           if constexpr (UseThisWorkItemAPI) {
+             auto Root = sycl::ext::oneapi::experimental::this_work_item::
+                 get_root_group<1>();
+             sycl::group_barrier(Root);
+           } else {
+             auto Root = it.ext_oneapi_get_root_group();
+             sycl::group_barrier(Root);
+           }
+           __ESIMD_NS::simd<int, VL> VOther(ID * VL, 1);
+           __ESIMD_NS::block_store(Data, ID * sizeof(int) * VL, VOther);
+         });
    }).wait();
   sycl::host_accessor Data{DataBuf};
   int ErrCnt = 0;
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
index 6cdbb29690fd..f0775c7d698f 100644
--- a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
+++ b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
@@ -1,5 +1,3 @@
-// XFAIL: run-mode && gpu-intel-dg2
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18579
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
index 91beb23ea01b..fb6d9ea0e0e7 100644
--- a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
@@ -1,5 +1,3 @@
-// XFAIL: run-mode && gpu-intel-dg2
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18579
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
index 996f8c98a3ac..38ae6bf29bf1 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_SLM.cpp
@@ -8,7 +8,7 @@
 // UNSUPPORTED: target-nvidia, target-amd
 // UNSUPPORTED-INTENDED: aspect-ext_intel_matrix isn't currently supported for
 // other triples
-// XFAIL: run-mode && gpu-intel-dg2
+// XFAIL: run-mode && igc-dev
 // XFAIL-TRACKER: CMPLRLLVM-66371
 
 // REQUIRES: aspect-ext_intel_matrix, gpu
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
index 0f3b2b93dcd9..11c00d3842ed 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
@@ -17,7 +17,7 @@
 
 // Waiting for the commit in IGC to be pulled into the driver to resolve the
 // test.
-// XFAIL: gpu-intel-dg2 && run-mode
+// XFAIL: run-mode && igc-dev
 // XFAIL-TRACKER: GSD-10510
 
 #include "common.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
index a290a8cb00a6..4fd2463cbd04 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -18,7 +18,7 @@
 
 // Waiting for the commit in IGC to be pulled into the driver to resolve the
 // test.
-// XFAIL: gpu-intel-dg2 && run-mode
+// XFAIL: run-mode && igc-dev
 // XFAIL-TRACKER: GSD-10510
 
 #include "common.hpp"