diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
index 30af9c9cfc0..e7449a11bf1 100644
--- a/Src/Base/AMReX_BaseFabUtility.H
+++ b/Src/Base/AMReX_BaseFabUtility.H
@@ -53,14 +53,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
             std::uint64_t const threadIdxx = handler.threadIdx();
             std::uint64_t const blockIdxx = handler.blockIdx();
             auto const shared = (T*)handler.sharedMemory();
-            if (icell < ntotcells) {
+            if (icell < indexer.numPts()) {
                 auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
                 auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             handler.sharedBarrier();
             for (std::uint64_t m = threadIdxx,
-                     mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE;
+                     mend = amrex::min(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
                  m < mend; m += blockDimx) {
                 p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
             }
@@ -72,14 +72,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
             std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
             Gpu::SharedMemory<T> gsm;
             T* const shared = gsm.dataPtr();
-            if (icell < ntotcells) {
+            if (icell < indexer.numPts()) {
                 auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
                 auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             __syncthreads();
             for (std::uint64_t m = threadIdx.x,
-                     mend = amrex::min(blockDim.x, ntotcells-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
+                     mend = amrex::min(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
                  m < mend; m += blockDim.x) {
                 p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
             }
diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H
index e7f6f2d5a48..82a2ad9cd13 100644
--- a/Src/Base/AMReX_Box.H
+++ b/Src/Base/AMReX_Box.H
@@ -1842,13 +1842,16 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy
 
 struct BoxIndexer
 {
+    std::uint64_t npts;
+
 #if (AMREX_SPACEDIM == 3)
     Math::FastDivmodU64 fdxy;
     Math::FastDivmodU64 fdx;
     IntVect lo;
 
     BoxIndexer (Box const& box)
-        : fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
+        : npts(box.numPts()),
+          fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
           fdx (std::uint64_t(box.length(0))),
           lo (box.smallEnd())
     {}
@@ -1877,7 +1880,8 @@ struct BoxIndexer
     IntVect lo;
 
     BoxIndexer (Box const& box)
-        : fdx (std::uint64_t(box.length(0))),
+        : npts(box.numPts()),
+          fdx (std::uint64_t(box.length(0))),
           lo (box.smallEnd())
     {}
 
@@ -1902,7 +1906,8 @@ struct BoxIndexer
     int lo;
 
     BoxIndexer (Box const& box)
-        : lo(box.smallEnd(0))
+        : npts(box.numPts()),
+          lo(box.smallEnd(0))
     {}
 
     [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
@@ -1918,6 +1923,9 @@ struct BoxIndexer
     }
 
 #endif
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    std::uint64_t numPts () const { return npts; }
 };
 
 }
diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H
index e2cf0ed9641..21029eff840 100644
--- a/Src/Base/AMReX_FabArrayBase.H
+++ b/Src/Base/AMReX_FabArrayBase.H
@@ -651,7 +651,7 @@ public:
         ~ParForInfo ();
 
         std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
-        Box const* getBoxes () const { return m_boxes; }
+        BoxIndexer const* getBoxes () const { return m_boxes; }
 
         ParForInfo () = delete;
         ParForInfo (ParForInfo const&) = delete;
@@ -663,7 +663,7 @@ public:
         IntVect m_ng;
         int m_nthreads;
         std::pair<int*,int*> m_nblocks_x;
-        Box* m_boxes = nullptr;
+        BoxIndexer* m_boxes = nullptr;
         char* m_hp = nullptr;
         char* m_dp = nullptr;
    };
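Note on the Box.H / FabArrayBase.H changes above: with the cell count cached in npts, BoxIndexer becomes the single object a kernel captures by value. It answers both "how many cells does this box have" (numPts()) and "which (i,j,k) is flat index icell" (operator(), using Math::FastDivmodU64 where a divide is needed). Since numPts() and operator() are AMREX_GPU_HOST_DEVICE, the same two calls can be exercised on the host. The sketch below is an illustration only; the helper name is made up and is not part of this patch:

    #include <cstdint>
    #include <AMReX_Box.H>
    #include <AMReX_IntVect.H>
    #include <AMReX_BLassert.H>

    // Illustration: the cached numPts() and the decoded (i,j,k) stay
    // consistent with the Box the indexer was built from.
    void check_box_indexer (amrex::Box const& bx)
    {
        amrex::BoxIndexer indexer(bx);
        AMREX_ALWAYS_ASSERT(indexer.numPts() == std::uint64_t(bx.numPts()));
        for (std::uint64_t icell = 0; icell < indexer.numPts(); ++icell) {
            auto [i, j, k] = indexer(icell);   // flat index -> cell, FastDivmodU64-based in 3D
            AMREX_ALWAYS_ASSERT(bx.contains(amrex::IntVect(AMREX_D_DECL(i, j, k))));
        }
    }

This is also why the kernels changed below can drop their separately captured ntotcells/ncells values: the indexer they already capture carries the same information.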
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index a280968c773..5e28ba20884 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -239,9 +239,8 @@ template
 void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -257,9 +256,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
-                        int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)),
+                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                           std::uint64_t(item.get_local_range(0)));
                         detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(),
                                                                 n_active_threads});
@@ -275,7 +274,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
                         detail::call_f(f,i,j,k,Gpu::Handler{&item});
                     }
@@ -291,9 +290,8 @@ template
 void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -309,9 +307,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
-                        int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)),
+                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                           std::uint64_t(item.get_local_range(0)));
                         detail::call_f(f, i, j, k, ncomp,
                                        Gpu::Handler{&item, shared_data.get_multi_ptr().get(),
@@ -328,7 +326,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
-                         icell < ncells; icell += stride) {
+                         icell < indexer.numPts(); icell += stride) {
                         auto [i, j, k] = indexer(icell);
                         detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item});
                     }
@@ -377,9 +375,8 @@ template
 void ParallelForRNG (Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::ExecutionConfig(ncells);
+    const auto ec = Gpu::ExecutionConfig(box.numPts());
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
     auto& q = Gpu::Device::streamQueue();
@@ -397,7 +394,7 @@ void ParallelForRNG (Box const& box, L&& f) noexcept
                 auto engine = engine_acc.load(tid);
                 RandomEngine rand_eng{&engine};
                 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
-                     icell < ncells; icell += stride) {
+                     icell < indexer.numPts(); icell += stride) {
                     auto [i, j, k] = indexer(icell);
                     f(i,j,k,rand_eng);
                 }
@@ -414,9 +411,8 @@ template
 void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -470,13 +463,14 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             f1(i,j,k);
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             f2(i,j,k);
                         }
@@ -494,14 +488,10 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -513,17 +503,18 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             f1(i,j,k);
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             f2(i,j,k);
                         }
-                        if (icell < ncells3) {
+                        if (icell < indexer3.numPts()) {
                             auto [i, j, k] = indexer3(icell);
                             f3(i,j,k);
                         }
@@ -543,12 +534,9 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  Box const& box2, T2 ncomp2, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -560,15 +548,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             for (T1 n = 0; n < ncomp1; ++n) {
                                 f1(i,j,k,n);
                             }
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             for (T2 n = 0; n < ncomp2; ++n) {
                                 f2(i,j,k,n);
@@ -592,14 +581,10 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  Box const& box3, T3 ncomp3, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     const auto nthreads_per_block = ec.numThreads.x;
     const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
     auto& q = Gpu::Device::streamQueue();
@@ -611,21 +596,22 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
+                    auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        if (icell < ncells1) {
+                        if (icell < indexer1.numPts()) {
                             auto [i, j, k] = indexer1(icell);
                             for (T1 n = 0; n < ncomp1; ++n) {
                                 f1(i,j,k,n);
                             }
                         }
-                        if (icell < ncells2) {
+                        if (icell < indexer2.numPts()) {
                             auto [i, j, k] = indexer2(icell);
                             for (T2 n = 0; n < ncomp2; ++n) {
                                 f2(i,j,k,n);
                             }
                         }
-                        if (icell < ncells3) {
+                        if (icell < indexer3.numPts()) {
                             auto [i, j, k] = indexer3(icell);
                             for (T3 n = 0; n < ncomp3; ++n) {
                                 f3(i,j,k,n);
@@ -768,16 +754,15 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < ncells; icell += stride)
+             icell < indexer.numPts(); icell += stride)
         {
             auto [i, j, k] = indexer(icell);
-            detail::call_f(f, i, j, k, (ncells-icell+(std::uint64_t)threadIdx.x));
+            detail::call_f(f, i, j, k, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x));
         }
     });
     AMREX_GPU_ERROR_CHECK();
@@ -788,15 +773,14 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
-             icell < ncells; icell += stride) {
+             icell < indexer.numPts(); icell += stride) {
             auto [i, j, k] = indexer(icell);
-            detail::call_f(f, i, j, k, ncomp, (ncells-icell+(std::uint64_t)threadIdx.x));
+            detail::call_f(f, i, j, k, ncomp, (indexer.numPts()-icell+(std::uint64_t)threadIdx.x));
         }
     });
     AMREX_GPU_ERROR_CHECK();
@@ -829,16 +813,15 @@ ParallelForRNG (Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     randState_t* rand_state = getRandState();
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::ExecutionConfig(ncells);
+    const auto ec = Gpu::ExecutionConfig(box.numPts());
     AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto [i, j, k] = indexer(icell);
             f(i,j,k,engine);
         }
@@ -853,16 +836,15 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     randState_t* rand_state = getRandState();
-    const auto ncells = std::uint64_t(box.numPts());
     const BoxIndexer indexer(box);
-    const auto ec = Gpu::ExecutionConfig(ncells);
+    const auto ec = Gpu::ExecutionConfig(box.numPts());
     AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto [i, j, k] = indexer(icell);
             for (T n = 0; n < ncomp; ++n) {
                 f(i,j,k,n,engine);
@@ -879,21 +861,19 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 f1(i,j,k);
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 f2(i,j,k);
             }
@@ -909,27 +889,24 @@ ParallelFor (Gpu::KernelInfo const&,
              L1&& f1, L2&& f2, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 f1(i,j,k);
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 f2(i,j,k);
             }
-            if (icell < ncells3) {
+            if (icell < indexer3.numPts()) {
                 auto [i, j, k] = indexer3(icell);
                 f3(i,j,k);
             }
@@ -947,23 +924,21 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box2, T2 ncomp2, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 for (T1 n = 0; n < ncomp1; ++n) {
                     f1(i,j,k,n);
                 }
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 for (T2 n = 0; n < ncomp2; ++n) {
                     f2(i,j,k,n);
@@ -985,31 +960,28 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box3, T3 ncomp3, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = std::uint64_t(box1.numPts());
-    const auto ncells2 = std::uint64_t(box2.numPts());
-    const auto ncells3 = std::uint64_t(box3.numPts());
-    const auto ncells = amrex::max(ncells1, ncells2, ncells3);
     const BoxIndexer indexer1(box1);
     const BoxIndexer indexer2(box2);
     const BoxIndexer indexer3(box3);
-    const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
+    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
+        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
         for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride
                  = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) {
-            if (icell < ncells1) {
+            if (icell < indexer1.numPts()) {
                 auto [i, j, k] = indexer1(icell);
                 for (T1 n = 0; n < ncomp1; ++n) {
                     f1(i,j,k,n);
                 }
             }
-            if (icell < ncells2) {
+            if (icell < indexer2.numPts()) {
                 auto [i, j, k] = indexer2(icell);
                 for (T2 n = 0; n < ncomp2; ++n) {
                     f2(i,j,k,n);
                 }
            }
-            if (icell < ncells3) {
+            if (icell < indexer3.numPts()) {
                 auto [i, j, k] = indexer3(icell);
                 for (T3 n = 0; n < ncomp3; ++n) {
                     f3(i,j,k,n);
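All of the rewritten launch bodies in AMReX_GpuLaunchFunctsG.H follow the same grid-stride shape: the captured BoxIndexer supplies both the loop bound and the flat-index decode, so the kernels no longer carry a separate ncells capture that could disagree with the indexer. A stripped-down standalone analogue is shown below; it is an illustration only (the real kernels go through AMREX_LAUNCH_KERNEL and detail::call_f) and assumes compilation as CUDA/HIP device code:

    #include <cstdint>
    #include <AMReX_Box.H>

    // Minimal grid-stride kernel over one Box, in the style of the hunks above.
    template <typename F>
    __global__ void parallel_for_box (amrex::BoxIndexer indexer, F f)
    {
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x + threadIdx.x,
                           stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < indexer.numPts(); icell += stride)
        {
            auto [i, j, k] = indexer(icell);   // 64-bit flat index -> cell (i,j,k)
            f(i, j, k);                        // f is any device-callable (i,j,k) functor
        }
    }

The two- and three-box variants only add per-box bounds checks (icell < indexerN.numPts()) inside the same loop, with the loop itself running to the largest of the cell counts.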
diff --git a/Src/Base/AMReX_MFParallelForG.H b/Src/Base/AMReX_MFParallelForG.H
index ba65b18937d..80260ca0eb5 100644
--- a/Src/Base/AMReX_MFParallelForG.H
+++ b/Src/Base/AMReX_MFParallelForG.H
@@ -12,36 +12,33 @@ namespace amrex {
 namespace detail {
 
 inline
-void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, Box*& pboxes,
+void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
                             Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
 {
     if (!ncells.empty()) {
         const int nboxes = ncells.size();
-        const std::size_t nbytes_boxes = amrex::aligned_size(16, (nboxes+1) * sizeof(int));
-        const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(Box);
+        const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
+        const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
         a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
         int* hp_blks = (int*)a_hp;
-        Box* hp_boxes = (Box*)(a_hp + nbytes_boxes);
+        auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
         hp_blks[0] = 0;
-        Long ntot = 0;
         bool same_size = true;
         for (int i = 0; i < nboxes; ++i) {
             Long nblocks = (ncells[i] + nthreads-1) / nthreads;
+            AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
             hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
-            ntot += nblocks;
 
             same_size = same_size && (ncells[i] == ncells[0]);
 
-            new (hp_boxes+i) Box(boxes[i]);
+            new (hp_boxes+i) BoxIndexer(boxes[i]);
         }
-        amrex::ignore_unused(ntot);
-        AMREX_ASSERT(static_cast<Long>(hp_blks[nboxes]) == ntot); // no overflow
 
         a_dp = (char*) The_Arena()->alloc(nbytes);
         Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);
 
         blocks_x.first = hp_blks;
         blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
-        pboxes = (Box*)(a_dp + nbytes_boxes);
+        pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
     }
 }
@@ -94,7 +91,7 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
     const int nblocks = par_for_blocks.first[nboxes];
     const int block_0_size = par_for_blocks.first[1];
     const int* dp_nblocks = par_for_blocks.second;
-    const Box* dp_boxes = parforinfo.getBoxes();
+    const BoxIndexer* dp_boxes = parforinfo.getBoxes();
 
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
 
@@ -102,13 +99,14 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
     amrex::launch_global<MT>
         <<<nblocks, MT, 0, Gpu::gpuStream()>>>
         ([=] AMREX_GPU_DEVICE () noexcept
         {
-            int ibox, icell;
+            int ibox;
+            std::uint64_t icell;
             if (dp_nblocks) {
                 ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
-                icell = (blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
+                icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
             } else {
                 ibox = blockIdx.x / block_0_size;
-                icell = (blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
+                icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
             }
 #elif defined(AMREX_USE_SYCL)
@@ -116,27 +114,21 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
     amrex::launch(nblocks, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
     {
-        int ibox, icell;
+        int ibox;
+        std::uint64_t icell;
         int blockIdxx = item.get_group_linear_id();
         int threadIdxx = item.get_local_linear_id();
         if (dp_nblocks) {
             ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
-            icell = (blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
+            icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
         } else {
             ibox = blockIdxx / block_0_size;
-            icell = (blockIdxx-ibox*block_0_size)*MT + threadIdxx;
+            icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
         }
 #endif
-        Box const& b = dp_boxes[ibox];
-        int ncells = b.numPts();
-        if (icell < ncells) {
-            const auto len = amrex::length(b);
-            int k =  icell /   (len.x*len.y);
-            int j = (icell - k*(len.x*len.y)) /   len.x;
-            int i = (icell - k*(len.x*len.y)) - j*len.x;
-            AMREX_D_TERM(i += b.smallEnd(0);,
-                         j += b.smallEnd(1);,
-                         k += b.smallEnd(2);)
+        BoxIndexer const& indexer = dp_boxes[ibox];
+        if (icell < indexer.numPts()) {
+            auto [i, j, k] = indexer(icell);
             for (int n = 0; n < ncomp; ++n) {
                 parfor_mf_detail::call_f(f, ibox, i, j, k, n);
             }
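In the multi-box ParallelFor above, each GPU block is mapped to one box through the prefix-sum array built by build_par_for_nblocks, and the per-box cell index is now accumulated in 64 bits so that (block offset)*MT cannot overflow an int for very large boxes. A host-side sketch of that mapping follows; it is illustrative only (the helper name is invented, and the include for amrex::bisect is an assumption), mirroring the bisect call used in the kernels:

    #include <cstdint>
    #include <utility>
    #include <AMReX_Algorithm.H>

    // blks is the per-box block prefix sum (blks[0] = 0, blks[nboxes] = total
    // number of blocks), i.e. what ParForInfo::getBlocks() hands to the kernel.
    std::pair<int, std::uint64_t>
    block_to_box_and_cell (int const* blks, int nboxes, int block_id,
                           int threads_per_block, int thread_id)
    {
        int ibox = amrex::bisect(blks, 0, nboxes, block_id);      // which box owns this block
        std::uint64_t icell = std::uint64_t(block_id - blks[ibox]) * threads_per_block
                              + thread_id;                        // cell within that box
        return {ibox, icell};
    }

With the boxes now stored on the device as BoxIndexer objects, the hand-rolled div/mod decode that used to follow this mapping collapses into a single indexer(icell) call.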
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index aa1a4dd0122..bf49c3df0a8 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -376,11 +376,12 @@ public:
         const int nblocks = par_for_blocks.first[nboxes];
         const int block_0_size = par_for_blocks.first[1];
         const int* dp_nblocks = par_for_blocks.second;
-        const Box* dp_boxes = parforinfo.getBoxes();
+        const BoxIndexer* dp_boxes = parforinfo.getBoxes();
 
         auto const& stream = Gpu::gpuStream();
         auto pdst = reduce_data.devicePtr(stream);
         int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
+        AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
         reduce_data.nBlocks(stream) = nblocks_ec;
         reduce_data.updateMaxStreamIndex(stream);
 
@@ -405,25 +406,19 @@ public:
                     dst = r;
                 }
                 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
-                    int ibox, icell;
+                    int ibox;
+                    std::uint64_t icell;
                     if (dp_nblocks) {
                         ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock);
-                        icell = (iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                        icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
                     } else {
                         ibox = iblock / block_0_size;
-                        icell = (iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                        icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
                     }
-                    Box const& b = dp_boxes[ibox];
-                    int ncells = b.numPts();
-                    if (icell < ncells) {
-                        const auto len = amrex::length(b);
-                        int k =  icell /   (len.x*len.y);
-                        int j = (icell - k*(len.x*len.y)) /   len.x;
-                        int i = (icell - k*(len.x*len.y)) - j*len.x;
-                        AMREX_D_TERM(i += b.smallEnd(0);,
-                                     j += b.smallEnd(1);,
-                                     k += b.smallEnd(2););
+                    BoxIndexer const& indexer = dp_boxes[ibox];
+                    if (icell < indexer.numPts()) {
+                        auto [i, j, k] = indexer(icell);
                         Reduce::detail::mf_call_f (f, ibox, i, j, k, ncomp, r);
                     }
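The switch from int to std::uint64_t for icell in both AMReX_MFParallelForG.H and AMReX_Reduce.H, together with the new asserts on the block prefix sum, guards the case where a single box holds more cells than a 32-bit int can count. A self-contained arithmetic check of that limit (illustration only, not AMReX code):

    #include <cassert>
    #include <cstdint>

    int main ()
    {
        // A 2048^3 box has 8'589'934'592 cells.
        constexpr std::uint64_t ncells = 2048ULL * 2048ULL * 2048ULL;
        constexpr int nthreads = 256;   // threads per block
        constexpr std::uint64_t nblocks = (ncells + nthreads - 1) / nthreads;
        // The flattened index (block offset)*nthreads + tid exceeds INT32_MAX ...
        static_assert(nblocks * nthreads > std::uint64_t(INT32_MAX),
                      "a 32-bit int cannot address every cell of this box");
        // ... but fits comfortably in 64 bits:
        std::uint64_t icell = std::uint64_t(nblocks - 1) * nthreads;   // last block, thread 0
        assert(icell < ncells);
        return 0;
    }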