Skip to content

Commit d5a81e0

Browse files
authored
Fix overflow when calculating workgroups count (#2104)
Fixes #2070. This PR updates several SYCL kernel launch functions in `src/ATen/native/xpu/sycl/Loops.h` to use `int64_t` for the workgroup-size and workgroup-count calculations. This change prevents integer-overflow issues when handling large tensor sizes.
1 parent fa1e391 commit d5a81e0

File tree

2 files changed

+21
-13
lines changed

2 files changed

+21
-13
lines changed

src/ATen/native/xpu/sycl/Loops.h

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,8 @@ static void launch_legacy_group_range_kernel(int64_t N, const func_t& f) {
314314

315315
auto ker = ElementwiseGroupRangeKernel<vec_size, func_t>(N, f);
316316

317-
int wg_sz = syclMaxWorkItemsPerSubSlice();
318-
int num_wg = ceil_div<int>(N, wg_sz * vec_size);
317+
int64_t wg_sz = syclMaxWorkItemsPerSubSlice();
318+
int64_t num_wg = ceil_div<int64_t>(N, wg_sz * vec_size);
319319
sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker);
320320
}
321321

@@ -328,9 +328,9 @@ static void launch_legacy_global_range_kernel(int64_t N, const func_t& f) {
328328

329329
auto ker = ElementwiseGlobalRangeKernel<func_t>(N, f);
330330

331-
int wg_sz = syclMaxWorkItemsPerSubSlice();
332-
int num_wg = ceil_div<int>(N, wg_sz);
333-
int hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz;
331+
int64_t wg_sz = syclMaxWorkItemsPerSubSlice();
332+
int64_t num_wg = ceil_div<int64_t>(N, wg_sz);
333+
int64_t hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz;
334334
num_wg = num_wg > hw_max_num_wg ? hw_max_num_wg : num_wg;
335335
sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker);
336336
}
@@ -355,8 +355,8 @@ static inline void launch_unrolled_kernel(
355355
auto ker = UnrolledElementwiseKernel(N, f, data, ic, oc, l, s);
356356
using ker_t = decltype(ker);
357357

358-
auto wg_sz = syclMaxWorkItemsPerSubSlice();
359-
int num_wg = ceil_div<int>(N, wg_sz * ker_t::item_work_size);
358+
int64_t wg_sz = syclMaxWorkItemsPerSubSlice();
359+
int64_t num_wg = ceil_div<int64_t>(N, wg_sz * ker_t::item_work_size);
360360
sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker);
361361
}
362362

@@ -393,13 +393,13 @@ static inline void launch_vectorized_kernel(
393393

394394
#define VEC_KER(vec_size) \
395395
{ \
396-
TORCH_CHECK(max_scalar_bytes* vec_size <= 16); \
396+
TORCH_CHECK(max_scalar_bytes * vec_size <= 16); \
397397
if constexpr (max_scalar_bytes * vec_size <= 16) { \
398398
auto ker = \
399399
VectorizedElementwiseKernel<vec_size, func_t, array_t, in_calc_t>( \
400400
N, f, data, input_calc); \
401-
int num_wg = ceil_div<int>(N, wg_sz * vec_size); \
402-
sycl_kernel_submit(wg_sz* num_wg, wg_sz, getCurrentSYCLQueue(), ker); \
401+
int64_t num_wg = ceil_div<int64_t>(N, wg_sz * vec_size); \
402+
sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); \
403403
} \
404404
}
405405

@@ -426,7 +426,7 @@ static inline void launch_vectorized_kernel(
426426
N, f, data, input_calc, output_calc, loader, storer);
427427
using ker_t = decltype(ker);
428428

429-
int num_wg = ceil_div<int>(N, wg_sz * ker_t::item_work_size);
429+
int64_t num_wg = ceil_div<int64_t>(N, wg_sz * ker_t::item_work_size);
430430
sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker);
431431
break;
432432
}
@@ -457,8 +457,8 @@ static inline void launch_unrolled_kernel_for_multi_outputs(
457457
out_calc_t>(N, f, data, ic, oc);
458458
using ker_t = decltype(ker);
459459

460-
int wg_sz = syclMaxWorkItemsPerSubSlice();
461-
int num_wg = ceil_div<int>(N, ker_t::item_work_size * wg_sz);
460+
int64_t wg_sz = syclMaxWorkItemsPerSubSlice();
461+
int64_t num_wg = ceil_div<int64_t>(N, ker_t::item_work_size * wg_sz);
462462
sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker);
463463
}
464464

test/xpu/test_tensor_creation_ops_xpu.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4371,6 +4371,14 @@ def test_full_like_inference(self, device):
43714371
torch.full_like(like, 1.0, dtype=torch.complex64).dtype, torch.complex64
43724372
)
43734373

4374+
@dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
4375+
def test_zeros_large(self, device, dtype):
4376+
output = torch.zeros(2**31 - 1, device=device, dtype=dtype)
4377+
4378+
@dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
4379+
def test_ones_large(self, device, dtype):
4380+
output = torch.ones(2**31 - 1, device=device, dtype=dtype)
4381+
43744382

43754383
# Tests for the `frombuffer` function (only work on CPU):
43764384
# Constructs tensors from Python objects that implement the buffer protocol,

0 commit comments

Comments
 (0)