# ---------------------------------------------------------------------------
# CMakeLists.txt additions (reconstructed from a flattened git patch).
# Two hunks: (1) CUDA compute-capability selection, placed BEFORE project()
# so CMAKE_CUDA_ARCHITECTURES is set when the CUDA language is enabled;
# (2) best-effort GPU auto-detection, placed inside the
# `if (CUDAToolkit_FOUND)` branch.
# ---------------------------------------------------------------------------

# --- Hunk 1: CUDA compute capability target --------------------------------
# Override with just the number: cmake -DCUDA_ARCH=86 ..
# Default: 7.5 (Turing).
set(CUDA_ARCH "75 - Turing (RTX 20xx, GTX 16xx)"
    CACHE STRING "CUDA compute capability target architecture")
set_property(CACHE CUDA_ARCH PROPERTY STRINGS
    "52 - Maxwell (GTX 9xx, Titan X)"
    "60 - Pascal (GTX 10xx, P100)"
    "61 - Pascal (GTX 1050/1030)"
    "70 - Volta (V100, Titan V100)"
    "75 - Turing (RTX 20xx, GTX 16xx)"
    "80 - Ampere (A100)"
    "86 - Ampere (RTX 30xx)"
    "89 - Ada Lovelace (RTX 40xx)"
    "90 - Hopper (H100)"
)

# Extract the leading digits of the selection.  A regex is used instead of
# string(SUBSTRING "${CUDA_ARCH}" 0 2 ...) so that a bare numeric override
# ("-DCUDA_ARCH=86") and future three-digit architectures ("100") both work.
string(REGEX MATCH "^[0-9]+" CUDA_ARCH_NUMBER "${CUDA_ARCH}")
if(NOT CUDA_ARCH_NUMBER)
    message(FATAL_ERROR
        "CUDA_ARCH must begin with a compute capability number, got: '${CUDA_ARCH}'")
endif()
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_NUMBER})
message(STATUS "CUDA architecture set to: sm_${CMAKE_CUDA_ARCHITECTURES}")

# --- Hunk 2: GPU auto-detection (inside `if (CUDAToolkit_FOUND)`) ----------
# Query the local GPU's compute capability.  Failure is non-fatal: build
# nodes may have no GPU or no nvidia-smi on PATH.
execute_process(
    COMMAND nvidia-smi --query-gpu=compute_cap --format=csv,noheader
    OUTPUT_VARIABLE DETECTED_GPU_CAPS
    RESULT_VARIABLE NVIDIA_SMI_RESULT
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET
)

if(NVIDIA_SMI_RESULT EQUAL 0 AND DETECTED_GPU_CAPS)
    # nvidia-smi reports "7.5" per GPU; strip the dot, then keep the first
    # GPU's number when several are present.
    string(REPLACE "." "" DETECTED_GPU_ARCH "${DETECTED_GPU_CAPS}")
    string(REGEX MATCH "[0-9]+" DETECTED_GPU_ARCH "${DETECTED_GPU_ARCH}")
    message(STATUS "Detected GPU compute capability: sm_${DETECTED_GPU_ARCH}")

    # Quoted STREQUAL operands prevent accidental variable re-dereference.
    if(NOT "${CUDA_ARCH_NUMBER}" STREQUAL "${DETECTED_GPU_ARCH}")
        message(WARNING
            "Target CUDA architecture (sm_${CUDA_ARCH_NUMBER}) differs from detected GPU (sm_${DETECTED_GPU_ARCH}).\n"
            "  The compiled code may not run optimally or at all on this system.\n"
            "  To build for the detected GPU: cmake -DCUDA_ARCH=${DETECTED_GPU_ARCH} ..\n"
            "  To build for a different target: ensure the target system has sm_${CUDA_ARCH_NUMBER} or newer."
        )
    else()
        message(STATUS "✓ Configured architecture matches detected GPU")
    endif()
else()
    message(STATUS "Could not detect GPU compute capability (nvidia-smi not available or no GPU present)")
endif()
// ---------------------------------------------------------------------------
// include/math/device_precision.cuh — section appended after the closing
// #endif of the precision-selection block (which defines scalar_t, CUDA_ABS,
// CUDA_SQRT, CUDA_EXP).  The patch shows the file previously ended without a
// trailing newline; this reconstruction restores it.
// ---------------------------------------------------------------------------

// Accumulator type for summation/reduction operations.
//
// Integer accumulation is used because:
//   1. atomicAdd on uint32_t is a single native hardware instruction
//      (the fastest available atomic).
//   2. The maximum possible sum is well within uint32_t range (~20M << 4.3G).
//   3. The final reduction to double is performed on the host.
//
// NOTE: If pixel_t can be negative (pedestal-subtracted data), change this
//       to int32_t.
// NOTE: If pixel values are non-integer (e.g. gain-corrected floats), revert
//       to: using accumulator_t = scalar_t;
using accumulator_t = uint32_t;
// ---------------------------------------------------------------------------
// include/math/math_utils.cuh — ceil_div addition.  Requires <type_traits>
// (added alongside this hunk).  Build note: the same patch also adds
// integrator.cu, kabsch.cu and extent.cc to integrator/CMakeLists.txt and an
// add_subdirectory(tests) line — see the project build files.
// ---------------------------------------------------------------------------

#ifndef DEVICE_HOST
#define DEVICE_HOST  // Fallback when compiled outside math_utils.cuh's guard.
#endif

/**
 * @brief Integer ceiling division
 *
 * Computes ceil(n/d) using integer arithmetic only, avoiding floating-point
 * conversion.  Implemented as n/d plus one when a positive remainder exists,
 * rather than the classic (n + d - 1) / d, because the latter:
 *   - wraps for unsigned n near the type's maximum (overflow), and
 *   - is wrong for negative n when |n| is an exact multiple of d
 *     (C++ division truncates toward zero, e.g. (-4 + 1) / 2 == -1,
 *     but ceil(-4/2) == -2).
 *
 * @tparam T Integer type
 * @param n Numerator
 * @param d Denominator (must be > 0)
 * @return Ceiling of n/d
 *
 * @example ceil_div(15, 4) returns 4, whereas 15/4 returns 3
 */
template <typename T>
DEVICE_HOST constexpr T ceil_div(T n, T d) {
    static_assert(std::is_integral_v<T>, "ceil_div requires an integer type");
    // n/d truncates toward zero; for d > 0 that already equals the ceiling
    // when n < 0, so round up only on a positive remainder.
    return n / d + static_cast<T>(n % d > 0);
}
+add_subdirectory(tests) \ No newline at end of file diff --git a/integrator/extent.cc b/integrator/extent.cc new file mode 100644 index 00000000..bf03cf10 --- /dev/null +++ b/integrator/extent.cc @@ -0,0 +1,191 @@ +/** + * @file extent.cc + * @brief Extent and bounding box algorithms for baseline CPU implementation + */ + +#include "extent.hpp" + +#include +#include +#include + +#include "ffs_logger.hpp" + +std::vector compute_kabsch_bounding_boxes( + const Eigen::Vector3d &s0, + const Eigen::Vector3d &rot_axis, + const mdspan_2d_double &s1_vectors, + const mdspan_2d_double &phi_positions, + const size_t num_reflections, + const double sigma_b, + const double sigma_m, + const Panel &panel, + const Scan &scan, + const MonochromaticBeam &beam, + const double n_sigma, + const double sigma_b_multiplier) { + std::vector extents; + extents.reserve(num_reflections); + + /* + * Tolerance for detecting when a reflection is nearly parallel to + * the rotation axis. When ζ = m₂ · e₁ approaches zero, it indicates + * the reflection's scattering plane is nearly parallel to the + * goniometer rotation axis, making the φ-to-image conversion + * numerically unstable. This threshold (1e-10) is chosen based on + * geometric considerations rather than pure floating-point precision + * - it represents a practical limit for "nearly parallel" geometry + * where the standard bounding box calculation should be bypassed in + * favor of spanning the entire image range. 
+ */ + static constexpr double ZETA_TOLERANCE = 1e-10; + + // Calculate the angular divergence parameters: + // Δb = nσ × σb × m (beam divergence extent) + // Δm = nσ × σm (mosaicity extent) + double delta_b = n_sigma * sigma_b * sigma_b_multiplier; + double delta_m = n_sigma * sigma_m; + + // Extract experimental parameters needed for coordinate transformations + const auto [osc_start, osc_width] = scan.get_oscillation(); + int image_range_start = scan.get_image_range()[0]; + int image_range_end = scan.get_image_range()[1]; + double wl = beam.get_wavelength(); + Matrix3d d_matrix_inv = panel.get_d_matrix().inverse(); + + // Process each reflection individually + for (size_t i = 0; i < num_reflections; ++i) { + // Extract reflection centroid data + Eigen::Vector3d s1_c( + s1_vectors(i, 0), s1_vectors(i, 1), s1_vectors(i, 2)); // s₁ᶜ from s1_vectors + double phi_c = (phi_positions(i, 2)); // φᶜ from xyzcal.mm column + + // Construct the Kabsch coordinate system for this reflection + // e1 = s₁ᶜ × s₀ / |s₁ᶜ × s₀| (perpendicular to scattering plane) + Eigen::Vector3d e1 = s1_c.cross(s0).normalized(); + // e2 = s₁ᶜ × e₁ / |s₁ᶜ × e₁| (within scattering plane, orthogonal to e1) + Eigen::Vector3d e2 = s1_c.cross(e1).normalized(); + + double s1_len = s1_c.norm(); + + // Calculate s′ vectors at the four corners of the integration region + // These correspond to the extremes: (±Δb, ±Δb) in Kabsch coordinates + std::vector s_prime_vectors; + static constexpr std::array, 4> corner_signs = { + {{1, 1}, {1, -1}, {-1, 1}, {-1, -1}}}; + + for (auto [e1_sign, e2_sign] : corner_signs) { + // Project Δb divergences onto Kabsch basis vectors + // p represents the displacement in reciprocal space + Eigen::Vector3d p = + (e1_sign * delta_b * e1 / s1_len) + (e2_sign * delta_b * e2 / s1_len); + + // Debug output for the Ewald sphere calculation + double p_magnitude = p.norm(); + logger.trace( + "Reflection {}, corner ({},{}): p.norm()={:.6f}, s1_len={:.6f}, " + "delta_b={:.6f}", + i, + 
e1_sign, + e2_sign, + p_magnitude, + s1_len, + delta_b); + + // Ensure the resulting s′ vector lies on the Ewald sphere + // This involves solving: |s′|² = |s₁ᶜ|² for the correct magnitude + double b = s1_len * s1_len - p.dot(p); + if (b < 0) { + logger.error( + "Negative b value: {:.6f} for reflection {} (p.dot(p)={:.6f}, " + "s1_len²={:.6f})", + b, + i, + p.dot(p), + s1_len * s1_len); + logger.error( + "This means the displacement vector is too large for the Ewald " + "sphere"); + // Skip this corner or use a fallback approach + continue; + } + double d = -(p.dot(s1_c) / s1_len) + std::sqrt(b); + + logger.trace("Reflection {}: b={:.6f}, d={:.6f}", i, b, d); + + // Construct the s′ vector: s′ = (d × ŝ₁ᶜ) + p + Eigen::Vector3d s_prime = (d * s1_c / s1_len) + p; + s_prime_vectors.push_back(s_prime); + } + + // Transform s′ vectors back to detector coordinates using Panel's get_ray_intersection + std::vector> detector_coords; + for (const auto &s_prime : s_prime_vectors) { + // Direct conversion from s′ vector to detector coordinates + // get_ray_intersection returns coordinates in mm + auto xy_mm_opt = panel.get_ray_intersection(s_prime); + if (!xy_mm_opt) { + continue; // Skip this corner if no intersection + } + std::array xy_mm = *xy_mm_opt; + + // Convert from mm to pixels using the new mm_to_px function + std::array xy_pixels = panel.mm_to_px(xy_mm[0], xy_mm[1]); + + detector_coords.push_back({xy_pixels[0], xy_pixels[1]}); + } + + // Determine the bounding box in detector coordinates + // Find minimum and maximum coordinates from the four corners + auto [min_x_it, max_x_it] = std::minmax_element( + detector_coords.begin(), + detector_coords.end(), + [](const auto &a, const auto &b) { return a.first < b.first; }); + auto [min_y_it, max_y_it] = std::minmax_element( + detector_coords.begin(), + detector_coords.end(), + [](const auto &a, const auto &b) { return a.second < b.second; }); + + BoundingBoxExtents bbox; + // Use floor/ceil as specified in the paper: xmin 
= floor(min([x1,x2,x3,x4])) + bbox.x_min = static_cast(std::floor(min_x_it->first)); + bbox.x_max = static_cast(std::ceil(max_x_it->first)); + bbox.y_min = static_cast(std::floor(min_y_it->second)); + bbox.y_max = static_cast(std::ceil(max_y_it->second)); + + // Calculate the image range (z-direction) using mosaicity parameter Δm + // The extent in φ depends on the geometry factor ζ = m₂ · e₁ + double zeta = rot_axis.dot(e1); + if (std::abs(zeta) > ZETA_TOLERANCE) { // Avoid division by zero + // Convert angular extents to rotation angles: φ′ = φᶜ ± Δm/ζ + double phi_plus = phi_c + delta_m / zeta; + double phi_minus = phi_c - delta_m / zeta; + + // Convert phi angles from radians to degrees before using scan parameters + double phi_plus_deg = phi_plus * 180.0 / M_PI; + double phi_minus_deg = phi_minus * 180.0 / M_PI; + + // Transform rotation angles to image numbers using scan parameters + double z_plus = + image_range_start - 1 + ((phi_plus_deg - osc_start) / osc_width); + double z_minus = + image_range_start - 1 + ((phi_minus_deg - osc_start) / osc_width); + + // Clamp to the actual image range and use floor/ceil for integer bounds + bbox.z_min = + std::max(image_range_start - 1, + static_cast(std::floor(std::min(z_plus, z_minus)))); + bbox.z_max = std::min( + image_range_end, static_cast(std::ceil(std::max(z_plus, z_minus)))); + } else { + // Handle degenerate case where reflection is parallel to rotation axis + // In this case, the reflection spans the entire image range + bbox.z_min = image_range_start; + bbox.z_max = image_range_end; + } + + extents.push_back(bbox); + } + + return extents; +} diff --git a/integrator/extent.hpp b/integrator/extent.hpp new file mode 100644 index 00000000..6b86f23c --- /dev/null +++ b/integrator/extent.hpp @@ -0,0 +1,79 @@ +/** + * @file extent.hpp + * @brief Header for extent and bounding box algorithms + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +// Define a 2D mdspan type alias for 
double precision +using mdspan_2d_double = + std::experimental::mdspan>; + +/** + * @brief Structure to hold pixel coordinate extents for reflection bounding + * boxes. + * + * Contains the minimum and maximum bounds in detector pixel coordinates (x, y) + * and image numbers (z) that define the region of interest around each + * reflection for integration. + */ +struct BoundingBoxExtents { + // x and y may need to be a double for extent calculations + // signed as some values are negative, but this should be clamped later + int x_min, x_max; ///< Detector x-pixel range (fast axis) + int y_min, y_max; ///< Detector y-pixel range (slow axis) + int z_min, z_max; ///< Image number range (rotation axis) +}; + +/** + * @brief Compute bounding box extents for reflection integration using the + * Kabsch coordinate system. + * + * 1. Calculates angular divergence parameters Δb and Δm + * 2. Projects these divergences onto the Kabsch coordinate system to find + * the corners of the integration region in reciprocal space + * 3. Transforms these reciprocal space coordinates back to detector pixel + * coordinates and image numbers to define practical bounding boxes + * + * The method accounts for the non-orthonormal nature of the Kabsch basis + * and ensures that the bounding boxes encompass the full extent of each + * reflection's diffraction profile. 
+ * + * @param s0 Incident beam vector (s₀), units of 1/Å + * @param rot_axis Unit goniometer rotation axis vector (m₂) + * @param s1_vectors Matrix of predicted s₁ vectors for all reflections, + * shape (num_reflections, 3) + * @param phi_positions Matrix containing reflection positions, where the third + * column contains φᶜ values in radians + * @param num_reflections Number of reflections to process + * @param sigma_b Beam divergence standard deviation in RADIANS (σb) + * @param sigma_m Mosaicity standard deviation in RADIANS (σm) + * @param panel Detector panel object for coordinate transformations + * @param scan Scan object containing oscillation and image range information + * @param beam Beam object for wavelength and other beam properties + * @param n_sigma Number of standard deviations to include in the bounding box + * (default: 3.0) + * @param sigma_b_multiplier Additional multiplier for beam divergence + * (default: 2.0, called 'm' in DIALS) + * @return Vector of BoundingBoxExtents structures, one per reflection + */ +std::vector compute_kabsch_bounding_boxes( + const Eigen::Vector3d &s0, + const Eigen::Vector3d &rot_axis, + const mdspan_2d_double &s1_vectors, + const mdspan_2d_double &phi_positions, + const size_t num_reflections, + const double sigma_b, + const double sigma_m, + const Panel &panel, + const Scan &scan, + const MonochromaticBeam &beam, + const double n_sigma = 3.0, + const double sigma_b_multiplier = 2.0); diff --git a/integrator/integrator.cc b/integrator/integrator.cc index 4723c039..d81ae4e6 100644 --- a/integrator/integrator.cc +++ b/integrator/integrator.cc @@ -1,13 +1,20 @@ /** * @file integrator.cc + * @brief Main application file for accelerated integration processing + * containing the necessary data loading, argument parsing, + * threading, data preparation, and GPU kernel invocation in + * order to perform GPU-accelerated integration processing. 
*/ +#include "integrator.cuh" + #include #include #include #include #include +#include #include #include #include @@ -27,9 +34,10 @@ #include "common.hpp" #include "cuda_arg_parser.hpp" #include "cuda_common.hpp" +#include "extent.hpp" #include "ffs_logger.hpp" #include "h5read.h" -#include "math/device_precision.cuh" +#include "kabsch.cuh" #include "math/math_utils.cuh" #include "math/vector3d.cuh" #include "predict.cc" @@ -42,6 +50,26 @@ using namespace std::chrono_literals; using mdspan_2d = std::experimental::mdspan>; +// Conversion helpers for Eigen to CUDA vector types +/** + * @brief Convert Eigen::Vector3d to fastvec::Vector3D + */ +inline fastvec::Vector3D to_vector3d(const Eigen::Vector3d &v) { + return fastvec::make_vector3d(static_cast(v.x()), + static_cast(v.y()), + static_cast(v.z())); +} + +/** + * @brief Convert mdspan row to fastvec::Vector3D + */ +template +inline fastvec::Vector3D to_vector3d(const MdspanType &mdspan, size_t row) { + return fastvec::make_vector3d(static_cast(mdspan(row, 0)), + static_cast(mdspan(row, 1)), + static_cast(mdspan(row, 2))); +} + // Global stop token for picking up user cancellation std::stop_source global_stop; @@ -343,28 +371,6 @@ int main(int argc, char **argv) { #pragma endregion Predict or extract predictions - // TODO: Improve this hacky conversion from double to scalar_t (float) - std::vector s1_vectors_converted_data(s1_vectors.extent(0) * 3); - mdspan_type s1_vectors_converted( - s1_vectors_converted_data.data(), s1_vectors.extent(0), 3); - //size_t num_reflections = s1_vectors.extent(0); - - // Direct pointer access conversion loop -> compiler should optimize this - const double *src = s1_vectors.data_handle(); - scalar_t *dst = s1_vectors_converted_data.data(); - for (size_t i = 0; i < num_reflections * 3; ++i) { - dst[i] = static_cast(src[i]); - } - - std::vector phi_positions_converted_data(phi_column.extent(0)); - // mdspan_type phi_positions_converted(phi_positions_converted_data.data(), 
phi_column.extent(0)); - // Direct pointer access conversion loop -> compiler should optimize this - src = phi_column.data_handle(); - dst = phi_positions_converted_data.data(); - for (size_t i = 0; i < phi_column.extent(0); ++i) { - dst[i] = static_cast(src[i]); - } - #pragma region Image Reading and Threading // Now set up for multi-threaded image reading and processing logger.info("Setting up image reading and threading"); @@ -379,67 +385,67 @@ int main(int argc, char **argv) { // Call the CPU-based extent function // Note: compute_kabsch_bounding_boxes expects double precision mdspan, // so we pass the original double precision s1_vectors and phi_column - // std::vector computed_bboxes = - // compute_kabsch_bounding_boxes(s0, - // rotation_axis, - // s1_vectors, - // phi_column, - // num_reflections, - // sigma_b, - // sigma_m, - // panel, - // scan, - // beam); + std::vector computed_bboxes = + compute_kabsch_bounding_boxes(s0, + rotation_axis, + s1_vectors, + phi_column, + num_reflections, + sigma_b, + sigma_m, + panel, + scan, + beam); logger.info("Bounding box computation completed"); // Convert BoundingBoxExtents to flat array format for storage - // std::vector computed_bbox_data(num_reflections * 6); - // for (size_t i = 0; i < num_reflections; ++i) { - // const int step = 6 * i; - // computed_bbox_data[step + 0] = computed_bboxes[i].x_min; - // computed_bbox_data[step + 1] = computed_bboxes[i].x_max; - // computed_bbox_data[step + 2] = computed_bboxes[i].y_min; - // computed_bbox_data[step + 3] = computed_bboxes[i].y_max; - // computed_bbox_data[step + 4] = static_cast(computed_bboxes[i].z_min); - // computed_bbox_data[step + 5] = static_cast(computed_bboxes[i].z_max); - // } + std::vector computed_bbox_data(num_reflections * 6); + for (size_t i = 0; i < num_reflections; ++i) { + const int step = 6 * i; + computed_bbox_data[step + 0] = computed_bboxes[i].x_min; + computed_bbox_data[step + 1] = computed_bboxes[i].x_max; + computed_bbox_data[step + 2] = 
computed_bboxes[i].y_min; + computed_bbox_data[step + 3] = computed_bboxes[i].y_max; + computed_bbox_data[step + 4] = static_cast(computed_bboxes[i].z_min); + computed_bbox_data[step + 5] = static_cast(computed_bboxes[i].z_max); + } // Map reflections by z layer (image number) - // logger.info("Mapping reflections by image number (z layer)"); - // std::unordered_map> reflections_by_image; + logger.info("Mapping reflections by image number (z layer)"); + std::unordered_map> reflections_by_image; - // for (size_t refl_id = 0; refl_id < num_reflections; ++refl_id) { - // const auto &bbox = computed_bboxes[refl_id]; + for (size_t refl_id = 0; refl_id < num_reflections; ++refl_id) { + const auto &bbox = computed_bboxes[refl_id]; - // // Add this reflection to all images it spans - // for (int z = bbox.z_min; z <= bbox.z_max; ++z) { - // reflections_by_image[z].push_back(refl_id); - // } - // } + // Add this reflection to all images it spans + for (int z = bbox.z_min; z <= bbox.z_max; ++z) { + reflections_by_image[z].push_back(refl_id); + } + } - // logger.info("Reflections mapped across {} unique images", - // reflections_by_image.size()); + logger.info("Reflections mapped across {} unique images", + reflections_by_image.size()); // Log some statistics about the mapping - // if (!reflections_by_image.empty()) { - // size_t min_refls_per_image = std::numeric_limits::max(); - // size_t max_refls_per_image = 0; - // size_t total_refls = 0; - - // for (const auto &[image, refls] : reflections_by_image) { - // min_refls_per_image = std::min(min_refls_per_image, refls.size()); - // max_refls_per_image = std::max(max_refls_per_image, refls.size()); - // total_refls += refls.size(); - // } - - // double avg_refls_per_image = - // static_cast(total_refls) / reflections_by_image.size(); - // logger.info("Reflections per image: min={}, max={}, avg={:.1f}", - // min_refls_per_image, - // max_refls_per_image, - // avg_refls_per_image); - // } + if (!reflections_by_image.empty()) { + 
size_t min_refls_per_image = std::numeric_limits::max(); + size_t max_refls_per_image = 0; + size_t total_refls = 0; + + for (const auto &[image, refls] : reflections_by_image) { + min_refls_per_image = std::min(min_refls_per_image, refls.size()); + max_refls_per_image = std::max(max_refls_per_image, refls.size()); + total_refls += refls.size(); + } + + double avg_refls_per_image = + static_cast(total_refls) / reflections_by_image.size(); + logger.info("Reflections per image: min={}, max={}, avg={:.1f}", + min_refls_per_image, + max_refls_per_image, + avg_refls_per_image); + } // Get threading parameters uint32_t num_cpu_threads = parser.get("threads"); @@ -480,17 +486,89 @@ int main(int argc, char **argv) { double time_waiting_for_images = 0.0; +#pragma region Prep GPU Data Buffers + // Get detector d_matrix and flatten for GPU + Eigen::Matrix3d d_matrix_eigen = panel.get_d_matrix(); + std::vector d_matrix_flat(9); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + d_matrix_flat[i * 3 + j] = static_cast(d_matrix_eigen(i, j)); + } + } + + // Convert s1_vectors to Vector3D array for GPU + std::vector s1_vectors_vec(num_reflections); + for (size_t i = 0; i < num_reflections; ++i) { + s1_vectors_vec[i] = to_vector3d(s1_vectors, i); + } + + // phi_positions_converted_data already contains phi values (column index 2 of xyzcal.mm) + // Need to extract just the phi component (3rd column) + std::vector phi_values_vec(num_reflections); + for (size_t i = 0; i < num_reflections; ++i) { + phi_values_vec[i] = static_cast(phi_column(i, 2)); + } + + // Allocate and copy GPU buffers (shared across all threads) + DeviceBuffer d_d_matrix(9); + DeviceBuffer d_s1_vectors(num_reflections); + DeviceBuffer d_phi_values(num_reflections); + DeviceBuffer d_bboxes(num_reflections); + + d_d_matrix.assign(d_matrix_flat.data()); + d_s1_vectors.assign(s1_vectors_vec.data()); + d_phi_values.assign(phi_values_vec.data()); + d_bboxes.assign(computed_bboxes.data()); + + // Convert 
beam parameters to Vector3D + fastvec::Vector3D s0_vec = to_vector3d(s0); + fastvec::Vector3D rot_axis_vec = to_vector3d(rotation_axis); + scalar_t wavelength = static_cast(wl); + scalar_t osc_start_scalar = static_cast(osc_start); + scalar_t osc_width_scalar = static_cast(osc_width); + + // Summation integration parameters + // delta_b and delta_m define the foreground ellipsoid extent in Kabsch space + constexpr scalar_t n_sigma = 3.0; // Default number of sigma for foreground cutoff + const scalar_t delta_b = n_sigma * static_cast(sigma_b); + const scalar_t delta_m = n_sigma * static_cast(sigma_m); + logger.info("Summation integration: delta_b={:.6f}, delta_m={:.6f} (n_sigma={})", + delta_b, + delta_m, + n_sigma); + + // Allocate accumulator buffers for summation integration (shared across all threads) + // These persist across all images and accumulate atomically + DeviceBuffer d_foreground_sum(num_reflections); + DeviceBuffer d_foreground_count(num_reflections); + DeviceBuffer d_background_sum(num_reflections); + DeviceBuffer d_background_count(num_reflections); + + // Zero-initialize the accumulator buffers + cudaMemset(d_foreground_sum.data(), 0, num_reflections * sizeof(accumulator_t)); + cudaMemset(d_foreground_count.data(), 0, num_reflections * sizeof(uint32_t)); + cudaMemset(d_background_sum.data(), 0, num_reflections * sizeof(accumulator_t)); + cudaMemset(d_background_count.data(), 0, num_reflections * sizeof(uint32_t)); + cuda_throw_error(); +#pragma endregion Prep GPU Data Buffers + +#pragma region Thread launch + logger.info("Starting image reading and processing threads"); // Spawn the reader threads std::vector threads; for (int thread_id = 0; thread_id < num_cpu_threads; ++thread_id) { threads.emplace_back([&, thread_id]() { auto stop_token = global_stop.get_token(); + CudaStream stream; // Per-thread CUDA stream - // Full image buffers for decompression - auto decompressed_image = make_cuda_pinned_malloc(width * height); + // Full image buffers 
for decompression (pinned memory for faster transfer) + auto host_image = make_cuda_pinned_malloc(width * height); auto raw_chunk_buffer = std::vector(width * height * sizeof(pixel_t)); + // Device memory for GPU processing + auto device_image = PitchedMalloc(width, height); + // Let all threads do setup tasks before reading starts cpu_sync.arrive_and_wait(); @@ -503,11 +581,11 @@ int main(int argc, char **argv) { } // Check if this image has any reflections - // if (reflections_by_image.find(image_num) - // == reflections_by_image.end()) { - // completed_images += 1; - // continue; - // } + if (reflections_by_image.find(image_num) + == reflections_by_image.end()) { + completed_images += 1; + continue; + } { std::scoped_lock lock(reader_mutex); @@ -561,11 +639,11 @@ int main(int argc, char **argv) { break; } - // Decompress the data + // Decompress the data into pinned host memory switch (reader.get_raw_chunk_compression()) { case Reader::ChunkCompression::BITSHUFFLE_LZ4: bshuf_decompress_lz4(buffer.data() + 12, - decompressed_image.get(), + host_image.get(), width * height, sizeof(pixel_t), 0); @@ -573,14 +651,57 @@ int main(int argc, char **argv) { case Reader::ChunkCompression::BYTE_OFFSET_32: decompress_byte_offset( buffer, - {decompressed_image.get(), + {host_image.get(), static_cast::size_type>(width * height)}); break; } - // TODO: image processing here - // decompressed_image.get() contains the decompressed pixel data (width * height pixels) - // reflections_by_image[image_num] contains the reflection IDs for this image + // Copy image data to device memory + cudaMemcpy2DAsync(device_image.get(), + device_image.pitch_bytes(), + host_image.get(), + width * sizeof(pixel_t), + width * sizeof(pixel_t), + height, + cudaMemcpyHostToDevice, + stream); + cuda_throw_error(); + + // Get reflection indices for this image and copy to device + const auto &refl_indices = reflections_by_image[image_num]; + size_t num_refls_this_image = refl_indices.size(); + + // Allocate 
per-image device buffer for reflection indices + DeviceBuffer d_reflection_indices(num_refls_this_image); + d_reflection_indices.assign(refl_indices.data()); + + // Launch Kabsch transform kernel for this image + // This computes Kabsch coordinates and atomically accumulates + // foreground/background intensities for summation integration + compute_kabsch_transform(device_image.get(), + device_image.pitch_bytes(), + width, + height, + image_num, + d_d_matrix.data(), + wavelength, + osc_start_scalar, + osc_width_scalar, + image_range_start, + s0_vec, + rot_axis_vec, + d_s1_vectors.data(), + d_phi_values.data(), + d_bboxes.data(), + d_reflection_indices.data(), + num_refls_this_image, + delta_b, + delta_m, + d_foreground_sum.data(), + d_foreground_count.data(), + d_background_sum.data(), + d_background_count.data(), + stream); logger.trace("Thread {} loaded image {}", thread_id, image_num); completed_images += 1; @@ -610,8 +731,457 @@ int main(int argc, char **argv) { logger.info("Total time waiting for images: {:.2f} s", time_waiting_for_images); } +#pragma region Summation Integration Finalization + // Host-side reduction and finalization + // Copy accumulator buffers back from GPU and compute final intensities + logger.info("Finalizing summation integration for {} reflections", num_reflections); + + // Allocate host buffers for results + std::vector h_foreground_sum(num_reflections); + std::vector h_foreground_count(num_reflections); + std::vector h_background_sum(num_reflections); + std::vector h_background_count(num_reflections); + + // Copy results from device to host + d_foreground_sum.extract(h_foreground_sum.data()); + d_foreground_count.extract(h_foreground_count.data()); + d_background_sum.extract(h_background_sum.data()); + d_background_count.extract(h_background_count.data()); + + // Compute final intensities for each reflection + std::vector intensities(num_reflections); + std::vector variances(num_reflections); + std::vector sigmas(num_reflections); // 
σ(I) = √Var(I) + std::vector backgrounds(num_reflections); // b̄ (background mean per pixel) + std::vector background_sigmas(num_reflections); // σ(b̄) + size_t valid_reflections = 0; + + for (size_t i = 0; i < num_reflections; ++i) { + uint32_t fg_count = h_foreground_count[i]; + uint32_t bg_count = h_background_count[i]; + + if (fg_count == 0) { + // No foreground pixels - reflection not measured + intensities[i] = 0.0; + variances[i] = -1.0; // Flag as invalid + sigmas[i] = -1.0; + backgrounds[i] = 0.0; + background_sigmas[i] = -1.0; + continue; + } + + double fg_sum = h_foreground_sum[i]; + + // Background mean b̄ (simple mean for now + double bg_mean = (bg_count > 0) ? h_background_sum[i] / bg_count : 0.0; + + // Subtract background: I = Σcᵢ(fg) − n_fg · b̄ + double background_total = bg_mean * fg_count; + double intensity = fg_sum - background_total; + + // Variance estimation + // + // I = Σcᵢ(fg) - B, where B = n_fg · b̄ (total background under foreground) + // + // Var(I) = |I| + |B| · (1 + n_fg / n_bg) + // + // The |I| term is the Poisson variance of the net signal. + // The |B| term accounts for the Poisson variance of the background, + // with the (1 + n_fg/n_bg) factor propagating the uncertainty in + // the background estimate from n_bg pixels to n_fg pixels. + // + double fg_bg_ratio = + (bg_count > 0) ? static_cast(fg_count) / bg_count : 0.0; + double variance = + std::abs(intensity) + std::abs(background_total) * (1.0 + fg_bg_ratio); + double bg_variance = std::abs(background_total) * (1.0 + fg_bg_ratio); + + intensities[i] = intensity; + variances[i] = variance; + sigmas[i] = (variance > 0.0) ? std::sqrt(variance) : 0.0; + backgrounds[i] = bg_mean; + background_sigmas[i] = (bg_variance > 0.0) ? 
std::sqrt(bg_variance) : 0.0; + valid_reflections++; + } + + logger.info("Summation integration complete: {} valid reflections out of {}", + valid_reflections, + num_reflections); + + // Log some statistics + if (valid_reflections > 0) { + double sum_intensity = 0.0; + double sum_sigma = 0.0; + double sum_bg = 0.0; + double max_intensity = -std::numeric_limits::infinity(); + double min_intensity = std::numeric_limits::infinity(); + size_t n_positive_isigma = 0; + double sum_i_over_sigma = 0.0; + for (size_t i = 0; i < num_reflections; ++i) { + if (variances[i] >= 0) { + sum_intensity += intensities[i]; + sum_sigma += sigmas[i]; + sum_bg += backgrounds[i]; + max_intensity = std::max(max_intensity, intensities[i]); + min_intensity = std::min(min_intensity, intensities[i]); + if (sigmas[i] > 0.0) { + sum_i_over_sigma += intensities[i] / sigmas[i]; + n_positive_isigma++; + } + } + } + logger.info("Intensity statistics: min={:.1f}, max={:.1f}, mean={:.1f}", + min_intensity, + max_intensity, + sum_intensity / valid_reflections); + logger.info("Mean σ(I)={:.2f}, mean background={:.2f}", + sum_sigma / valid_reflections, + sum_bg / valid_reflections); + if (n_positive_isigma > 0) { + logger.info("Mean I/σ(I)={:.2f} ({} reflections with σ>0)", + sum_i_over_sigma / n_positive_isigma, + n_positive_isigma); + } + } + +#pragma endregion Summation Integration Finalization + #pragma endregion Image Reading and Threading return 0; } +#pragma Thread launch #pragma endregion Application Entry + +#pragma region Bbox computation +/* + // Compute new bounding boxes using Kabsch coordinate system + + logger.info("Computing new Kabsch bounding boxes for {} reflections", + num_reflections); + + // Create output array for bounding boxes (6 values per reflection) + std::vector computed_bbox_data(num_reflections * 6); + + // TODO: Make these conversions better + + // Convert s1_vectors from double to scalar_t (float) format + std::vector s1_vectors_converted(num_reflections); + for (size_t i 
= 0; i < num_reflections; ++i) { + s1_vectors_converted[i] = + fastvec::make_vector3d(static_cast(s1_vectors(i, 0)), + static_cast(s1_vectors(i, 1)), + static_cast(s1_vectors(i, 2))); + } + + // Convert phi values from double to scalar_t + std::vector phi_values_converted(num_reflections); + for (size_t i = 0; i < num_reflections; ++i) { + phi_values_converted[i] = + static_cast(phi_column(i, 2)); // Extract phi (z-component) + } + + // Extract detector parameters (assuming you add accessors to Panel class) + DetectorParameters detector_params; + auto pixel_size_array = panel.get_pixel_size(); + detector_params.pixel_size[0] = static_cast(pixel_size_array[0]); + detector_params.pixel_size[1] = static_cast(pixel_size_array[1]); + + // TODO: Add these accessors to Panel class + detector_params.parallax_correction = panel.has_parallax_correction(); + detector_params.mu = static_cast(panel.get_mu()); + detector_params.thickness = static_cast(panel.get_thickness()); + + // Convert geometry vectors to CUDA format + auto fast_axis_eigen = panel.get_fast_axis(); + auto slow_axis_eigen = panel.get_slow_axis(); + auto origin_eigen = panel.get_origin(); + + detector_params.fast_axis = + fastvec::make_vector3d(static_cast(fast_axis_eigen.x()), + static_cast(fast_axis_eigen.y()), + static_cast(fast_axis_eigen.z())); + detector_params.slow_axis = + fastvec::make_vector3d(static_cast(slow_axis_eigen.x()), + static_cast(slow_axis_eigen.y()), + static_cast(slow_axis_eigen.z())); + detector_params.origin = + fastvec::make_vector3d(static_cast(origin_eigen.x()), + static_cast(origin_eigen.y()), + static_cast(origin_eigen.z())); + + // Get D-matrix inverse + auto d_matrix_inv = panel.get_d_matrix().inverse(); + std::vector d_matrix_inv_flat(9); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + d_matrix_inv_flat[i * 3 + j] = static_cast(d_matrix_inv(i, j)); + } + } + + // Compute new bounding boxes using CUDA with proper detector parameters + compute_bbox_extent( + 
s1_vectors_converted.data(), + phi_values_converted.data(), + fastvec::make_vector3d(s0.x(), s0.y(), s0.z()), + fastvec::make_vector3d(rotation_axis.x(), rotation_axis.y(), rotation_axis.z()), + sigma_b, + sigma_m, + static_cast(osc_start), + static_cast(osc_width), + image_range_start, + scan.get_image_range()[1], // image_range_end + static_cast(beam.get_wavelength()), + d_matrix_inv_flat.data(), + detector_params.pixel_size, + detector_params.parallax_correction, + detector_params.mu, + detector_params.thickness, + detector_params.fast_axis, + detector_params.slow_axis, + detector_params.origin, + computed_bbox_data.data(), + num_reflections); + + // Convert to reflection table format for comparison + // Data is already in the correct flat format from compute_bbox_extent + logger.info("Bounding box computation completed"); + + // Compare with existing bounding boxes + logger.info("Comparing computed bounding boxes with existing bbox column"); + logger.trace("First 5 bounding box comparisons:"); + for (size_t i = 0; i < std::min(5, num_reflections); ++i) { + // Existing bbox + double ex_x_min = bbox_column(i, 0); + double ex_x_max = bbox_column(i, 1); + double ex_y_min = bbox_column(i, 2); + double ex_y_max = bbox_column(i, 3); + int ex_z_min = static_cast(bbox_column(i, 4)); + int ex_z_max = static_cast(bbox_column(i, 5)); + + // Computed bbox (from flat array) + double comp_x_min = computed_bbox_data[i * 6 + 0]; + double comp_x_max = computed_bbox_data[i * 6 + 1]; + double comp_y_min = computed_bbox_data[i * 6 + 2]; + double comp_y_max = computed_bbox_data[i * 6 + 3]; + int comp_z_min = static_cast(computed_bbox_data[i * 6 + 4]); + int comp_z_max = static_cast(computed_bbox_data[i * 6 + 5]); + + logger.trace("bbox[{}]: existing x=[{:.1f},{:.1f}] y=[{:.1f},{:.1f}] z=[{},{}]", + i, + ex_x_min, + ex_x_max, + ex_y_min, + ex_y_max, + ex_z_min, + ex_z_max); + logger.trace("bbox[{}]: computed x=[{:.1f},{:.1f}] y=[{:.1f},{:.1f}] z=[{},{}]", + i, + comp_x_min, + 
comp_x_max, + comp_y_min, + comp_y_max, + comp_z_min, + comp_z_max); + } + +#pragma endregion Bbox computation +#pragma region Kabsch tf + + logger.info( + "Computing Kabsch coordinates for voxel centers within existing bounding boxes"); + + std::vector voxel_kabsch_coords; // ε₁, ε₂, ε₃ for each voxel center + std::vector voxel_reflection_ids; // Which reflection each voxel belongs to + std::vector voxel_positions; // x, y, z positions for each voxel center + std::vector voxel_s1_lengths; // |s₁| for each voxel center + + // Convert global vectors to CUDA format once + fastvec::Vector3D s0_cuda = fastvec::make_vector3d(s0.x(), s0.y(), s0.z()); + fastvec::Vector3D rotation_axis_cuda = + fastvec::make_vector3d(rotation_axis.x(), rotation_axis.y(), rotation_axis.z()); + + // Process each reflection's existing bounding box + for (size_t refl_id = 0; refl_id < num_reflections; ++refl_id) { + // Extract bounding box from existing column (format: x_min, x_max, y_min, y_max, z_min, z_max) + double x_min = bbox_column(refl_id, 0); + double x_max = bbox_column(refl_id, 1); + double y_min = bbox_column(refl_id, 2); + double y_max = bbox_column(refl_id, 3); + int z_min = static_cast(bbox_column(refl_id, 4)); + int z_max = static_cast(bbox_column(refl_id, 5)); + + // Debug: Calculate expected voxel count + int x_count = static_cast(x_max) - static_cast(x_min); + int y_count = static_cast(y_max) - static_cast(y_min); + int z_count = z_max - z_min; + int expected_voxels = x_count * y_count * z_count; + + logger.trace( + "Reflection {}: bbox x=[{:.1f},{:.1f}] y=[{:.1f},{:.1f}] z=[{},{}]", + refl_id, + x_min, + x_max, + y_min, + y_max, + z_min, + z_max); + logger.trace("Expected voxels: {} x {} x {} = {}", + x_count, + y_count, + z_count, + expected_voxels); + + // Get reflection centroid data + Eigen::Vector3d s1_c_eigen( + s1_vectors(refl_id, 0), s1_vectors(refl_id, 1), s1_vectors(refl_id, 2)); + fastvec::Vector3D s1_c_cuda = + fastvec::make_vector3d(s1_c_eigen.x(), 
s1_c_eigen.y(), s1_c_eigen.z()); + double phi_c = phi_column(refl_id, 2); + + logger.trace( + "Processing reflection {} with existing bbox x=[{:.1f},{:.1f}] " + "y=[{:.1f},{:.1f}] z=[{},{}]", + refl_id, + x_min, + x_max, + y_min, + y_max, + z_min, + z_max); + + // Collect all voxel data for this reflection for batch processing + std::vector batch_s_pixels; + std::vector batch_phi_pixels; + std::vector> batch_voxel_coords; + + // Iterate through all voxel centers in the bounding box + for (int z = z_min; z < z_max; ++z) { + double phi_pixel = + osc_start + (z - image_range_start + 1.5) * osc_width / 180.0 * M_PI; + + for (int y = static_cast(y_min); y < static_cast(y_max); ++y) { + for (int x = static_cast(x_min); x < static_cast(x_max); + ++x) { + // Use voxel center coordinates + // double voxel_x = x + 0.5; + // double voxel_y = y + 0.5; + double voxel_z = 1; //z + 0.5; + + std::array xy_mm = panel.px_to_mm( + static_cast(x) + 0.5, static_cast(y) + 0.5); + + double voxel_x = xy_mm[0]; + double voxel_y = xy_mm[1]; + + // Convert detector coordinates to lab coordinate using panel geometry + Vector3d lab_coord = + panel.get_d_matrix() * Vector3d(voxel_x, voxel_y, 1.0); + + // logger.trace( + // "Voxel center ({}, {}, {}): lab_coord = ({:.6f}, {:.6f}, {:.6f})", + // voxel_x, + // voxel_y, + // voxel_z, + // lab_coord.x(), + // lab_coord.y(), + // lab_coord.z()); + + // Convert lab coordinate to reciprocal space vector + Eigen::Vector3d s_pixel_eigen = lab_coord.normalized() / wl; + fastvec::Vector3D s_pixel_cuda = fastvec::make_vector3d( + s_pixel_eigen.x(), s_pixel_eigen.y(), s_pixel_eigen.z()); + + // Store for batch processing + batch_s_pixels.push_back(s_pixel_cuda); + batch_phi_pixels.push_back(static_cast(phi_pixel)); + batch_voxel_coords.emplace_back(voxel_x, voxel_y, voxel_z); + } + } + } + + // Process all voxels for this reflection with CUDA if we have any + if (!batch_s_pixels.empty()) { + size_t num_voxels = batch_s_pixels.size(); + + // Output arrays + 
std::vector batch_eps_results(num_voxels); + std::vector batch_s1_len_results(num_voxels); + + // Call CUDA function for voxel processing + compute_kabsch_transform(batch_s_pixels.data(), + batch_phi_pixels.data(), + s1_c_cuda, + static_cast(phi_c), + s0_cuda, + rotation_axis_cuda, + batch_eps_results.data(), + batch_s1_len_results.data(), + num_voxels); + + // Store results + for (size_t i = 0; i < num_voxels; ++i) { + const auto &[voxel_x, voxel_y, voxel_z] = batch_voxel_coords[i]; + const auto &eps = batch_eps_results[i]; + + voxel_kabsch_coords.insert(voxel_kabsch_coords.end(), + {eps.x, eps.y, eps.z}); + voxel_reflection_ids.push_back(static_cast(refl_id)); + voxel_positions.insert(voxel_positions.end(), + {voxel_x, voxel_y, voxel_z}); + voxel_s1_lengths.push_back( + static_cast(batch_s1_len_results[i])); + } + } + } + + size_t actual_voxels = voxel_reflection_ids.size(); + logger.info("Processed {} voxel centers, computed {} Kabsch coordinates", + actual_voxels, + voxel_kabsch_coords.size() / 3); + + // Add debugging information + logger.info( + "Vector sizes: kabsch_coords={}, reflection_ids={}, positions={}, s1_lengths={}", + voxel_kabsch_coords.size(), + voxel_reflection_ids.size(), + voxel_positions.size(), + voxel_s1_lengths.size()); + + logger.info( + "Expected sizes: kabsch_coords={}, reflection_ids={}, positions={}, " + "s1_lengths={}", + actual_voxels * 3, + actual_voxels, + actual_voxels * 3, + actual_voxels); + +#pragma endregion Kabsch tf +#pragma region Application Output + + // Add computed bounding boxes to reflection table for comparison + // computed_bbox_data is already std::vector + reflections.add_column("computed_bbox", + std::vector{num_reflections, 6}, + computed_bbox_data); + + // Create voxel data table + ReflectionTable voxel_table; + voxel_table.add_column( + "kabsch_coordinates", std::vector{actual_voxels, 3}, voxel_kabsch_coords); + voxel_table.add_column( + "reflection_id", + std::vector{actual_voxels, 1}, + 
std::vector(voxel_reflection_ids.begin(), voxel_reflection_ids.end())); + voxel_table.add_column( + "pixel_coordinates", std::vector{actual_voxels, 3}, voxel_positions); + voxel_table.add_column( + "voxel_s1_length", std::vector{actual_voxels, 1}, voxel_s1_lengths); + + // Write output files + reflections.write("output_reflections.h5"); + voxel_table.write("voxel_kabsch_data.h5"); + + logger.info("Results saved to output_reflections.h5 and voxel_kabsch_data.h5"); +*/ +#pragma endregion Application Output diff --git a/integrator/integrator.cu b/integrator/integrator.cu new file mode 100644 index 00000000..cbf4c4b0 --- /dev/null +++ b/integrator/integrator.cu @@ -0,0 +1,5 @@ +/** + * @file integrator.cu + */ + +#include "integrator.cuh" \ No newline at end of file diff --git a/integrator/integrator.cuh b/integrator/integrator.cuh new file mode 100644 index 00000000..8cd5f72e --- /dev/null +++ b/integrator/integrator.cuh @@ -0,0 +1,7 @@ +/** + * @file integrator.cuh + */ + +#pragma once + +// Nothing yet \ No newline at end of file diff --git a/integrator/kabsch.cu b/integrator/kabsch.cu new file mode 100644 index 00000000..493b62eb --- /dev/null +++ b/integrator/kabsch.cu @@ -0,0 +1,601 @@ +/** + * @file kabsch.cu + * @brief CUDA implementation of Kabsch coordinate transformations for + * integration + * + * This file implements GPU-accelerated coordinate transformations used + * in the Kabsch algorithm for data processing. The + * Kabsch transformation converts pixel coordinates from reciprocal + * space into a geometry-invariant local coordinate system, enabling + * efficient reflection profile integration and summation. 
+ * + * The coordinate system is defined by three basis vectors: + * - e₁: Perpendicular to the scattering plane (s₁ᶜ × s₀) + * - e₂: Within the scattering plane, orthogonal to e₁ (s₁ᶜ × e₁) + * - e₃: Bisects the incident and diffracted beam directions (s₁ᶜ + s₀) + * + * The resulting Kabsch coordinates (ε₁, ε₂, ε₃) represent: + * - ε₁: Displacement perpendicular to the scattering plane + * - ε₂: Displacement within the scattering plane (with rotation correction) + * - ε₃: Displacement along the rotation axis (scaled by geometry factor ζ) + * + * This implementation uses modern C++ RAII principles with DeviceBuffer + * for automatic GPU memory management and the custom Vector3D class for + * mathematical operations. All vector operations are optimized for both + * host and device execution contexts. + * + * @author Dimitrios Vlachos + * @date 2025 + * @see Vector3D for mathematical vector operations + * @see DeviceBuffer for GPU memory management + * + * References: + * - Kabsch, W. (2010). Acta Cryst. D66, 133–144 + * + * GOAL: Determine foreground/background pixels and atomically aggregate + * intensities per reflection within the kernel, avoiding the need to return + * all epsilon values (too much data). + * + * TODO: + * 2. Fix unit test + * - Remove deprecated kernel + * - Refactor unit test to utilise the new kernel and host wrapper + * + * 3. Dials-data? + */ + +#include + +#include + +#include "cuda_common.hpp" +#include "device_common.cuh" +#include "extent.hpp" +#include "h5read.h" +#include "integrator.cuh" +#include "kabsch.cuh" +#include "math/math_utils.cuh" + +using namespace fastvec; + +// Block dimensions for the Kabsch transform kernel. +// Used by both the kernel (shared memory sizing) and host wrapper (grid launch). +// +// Each block of KABSCH_BLOCK_X × KABSCH_BLOCK_Y threads maps 1:1 to corners. +// A pixel's voxel has 4 xy-corners: (px,py), (px+1,py), (px,py+1), (px+1,py+1). 
// All 4 must be in the same block's shared memory so the pixel thread can read
// them after __syncthreads(). This means a block of 32×16 corners can only
// service (32-1)×(16-1) = 31×15 complete pixels — the rightmost column and
// bottom row of threads are "helper" corners that exist solely so the last
// interior pixel has a right/bottom neighbour to read.
//
// The grid is launched with a stride of (BLOCK-1) so adjacent blocks overlap
// by one column/row of corners. That single shared column/row is computed
// by both blocks (redundantly), which costs ~3-6% extra work but keeps
// everything in fast shared memory within a single kernel launch.
constexpr int KABSCH_BLOCK_X = 32;
constexpr int KABSCH_BLOCK_Y = 16;

// Maximum number of reflections that can overlap a single block's footprint
constexpr size_t MAX_BLOCK_REFLECTIONS = 64;

/**
 * @brief Decide whether a point in Kabsch space lies inside a reflection's
 *        foreground ellipsoid.
 *
 * Membership test: ε₁²/δ_B² + ε₂²/δ_B² + ε₃²/δ_M² ≤ 1.0
 *
 * @param eps     Kabsch coordinates (ε₁, ε₂, ε₃)
 * @param delta_b Foreground extent in the e₁/e₂ directions (n_sigma × σ_D)
 * @param delta_m Foreground extent in the e₃ direction (n_sigma × σ_M)
 * @return true if the point is foreground, false if background
 */
__device__ __forceinline__ bool is_foreground(const Vector3D &eps,
                                              scalar_t delta_b,
                                              scalar_t delta_m) {
    // Multiply by precomputed reciprocals rather than dividing twice —
    // division is the more expensive operation on the GPU.
    // -> Worth confirming in the profiler how the compiler lowers this.
    const scalar_t inv_db2 = scalar_t(1.0) / (delta_b * delta_b);
    const scalar_t inv_dm2 = scalar_t(1.0) / (delta_m * delta_m);

    // ε₁²/δ_B² + ε₂²/δ_B² + ε₃²/δ_M²
    const scalar_t membership =
      (eps.x * eps.x + eps.y * eps.y) * inv_db2 + (eps.z * eps.z) * inv_dm2;

    return membership <= scalar_t(1.0);
}

/**
 * @brief Transform a pixel
from reciprocal space into the local Kabsch + coordinate frame + * + * Given a predicted reflection centre and a pixel's position in + * reciprocal space, this function calculates the local Kabsch + * coordinates (ε₁, ε₂, ε₃), which represent displacements along a + * non-orthonormal basis defined by the scattering geometry. + * + * This is used to determine whether a pixel falls within the profile of + * a reflection in Kabsch space, which allows summation or profile + * integration to proceed in a geometry-invariant coordinate frame. + * + * @param s0 Incident beam vector (s₀), units of 1/Å + * @param s1_c Predicted diffracted vector at the reflection centre (s₁ᶜ), units of 1/Å + * @param phi_c Rotation angle at the reflection centre (φᶜ), in radians + * @param s_pixel Diffracted vector at the current pixel (S′), units of 1/Å + * @param phi_pixel Rotation angle at the pixel (φ′), in radians + * @param rot_axis Unit goniometer rotation axis vector (m₂) + * @param s1_len_out Output parameter for magnitude of s₁ᶜ (|s₁|) + * @return Transformed coordinates in Kabsch space (ε₁, ε₂, ε₃) + */ +__device__ Vector3D pixel_to_kabsch(const Vector3D &s0, + const Vector3D &s1_c, + scalar_t phi_c, + const Vector3D &s_pixel, + scalar_t phi_pixel, + const Vector3D &rot_axis, + scalar_t &s1_len_out) { + // Define the local Kabsch basis vectors: + + // e1 is perpendicular to the scattering plane + Vector3D e1 = normalized(cross(s1_c, s0)); + + // e2 lies within the scattering plane, orthogonal to e1 + Vector3D e2 = normalized(cross(s1_c, e1)); + + // e3 bisects the angle between s0 and s1 + Vector3D e3 = normalized(s1_c + s0); + + // Compute the length of the predicted diffracted vector (|s₁|) + scalar_t s1_len = norm(s1_c); + s1_len_out = s1_len; + + // Rotation offset between the pixel and reflection centre + scalar_t dphi = phi_pixel - phi_c; + + // Compute the predicted diffracted vector at φ′ + Vector3D s1_phi_prime = s1_c + e3 * dphi; + + // Difference vector between pixel's 
s′ and the φ′-adjusted centroid + Vector3D deltaS = s_pixel - s1_phi_prime; + + // ε₁: displacement along e1, normalised by |s₁| + scalar_t eps1 = dot(e1, deltaS) / s1_len; + + // ε₂: displacement along e2, normalised by |s₁| + scalar_t eps2 = dot(e2, deltaS) / s1_len; + + // ε₃: displacement along rotation axis, scaled by ζ = m₂ · e₁ + scalar_t zeta = dot(rot_axis, e1); + scalar_t eps3 = zeta * dphi; + + return make_vector3d(eps1, eps2, eps3); +} + +/** + * @brief CUDA kernel with 8-corner voxel foreground classification + * + * Each thread represents a corner in the (width+1)×(height+1) corner grid. + * Blocks use overlapping tiles: KABSCH_BLOCK_X × KABSCH_BLOCK_Y threads + * cover that many corners, yielding (KABSCH_BLOCK_X-1) × (KABSCH_BLOCK_Y-1) + * complete pixels per block. Adjacent blocks share one row/column of corners. + * + * Algorithm: + * 1. Every thread computes Kabsch coordinates for its corner at two φ values + * (lower and upper z-faces of the voxel) and writes foreground flags to + * shared memory. + * 2. After sync, interior threads (representing complete pixels) check all + * 8 voxel corners. If ANY corner is foreground, the pixel is foreground; + * otherwise it's background. 
+ * + * @param d_image Pointer to image data + * @param image_pitch Pitch of the image data in bytes + * @param width Image width in pixels + * @param height Image height in pixels + * @param image_num Current image number in the sequence + * @param d_matrix Detector coordinate transformation matrix (3x3) + * @param wavelength X-ray wavelength in Angstroms + * @param osc_start Starting oscillation angle in degrees + * @param osc_width Oscillation width per image in degrees + * @param image_range_start First image number in the range + * @param s0 Incident beam vector + * @param rot_axis Rotation axis vector + * @param d_s1_vectors Array of s1 vectors for each reflection + * @param d_phi_values Array of phi values for each reflection + * @param d_bboxes Array of bounding box structs for each reflection + * @param d_reflection_indices Indices of reflections touching this image + * @param num_reflections_this_image Number of reflections to process + * @param delta_b Foreground extent in e₁/e₂ directions (n_sigma × σ_D) + * @param delta_m Foreground extent in e₃ direction (n_sigma × σ_M) + * @param d_foreground_sum Output: accumulated foreground intensities per reflection + * @param d_foreground_count Output: foreground pixel counts per reflection + * @param d_background_sum Output: accumulated background intensities per reflection + * @param d_background_count Output: background pixel counts per reflection + */ +__global__ void kabsch_transform(pixel_t *d_image, + size_t image_pitch, + uint32_t width, + uint32_t height, + int image_num, + const scalar_t *d_matrix, + scalar_t wavelength, + scalar_t osc_start, + scalar_t osc_width, + int image_range_start, + Vector3D s0, + Vector3D rot_axis, + const Vector3D *d_s1_vectors, + const scalar_t *d_phi_values, + const BoundingBoxExtents *d_bboxes, + const size_t *d_reflection_indices, + size_t num_reflections_this_image, + // Summation integration parameters + scalar_t delta_b, + scalar_t delta_m, + // Output accumulators 
(atomically updated) + accumulator_t *d_foreground_sum, + uint32_t *d_foreground_count, + accumulator_t *d_background_sum, + uint32_t *d_background_count) { + // ── Shared Memory ────────────────────────────────────────────── + // Foreground status for each corner at three phi values: + // [0] = phi_low, [1] = phi_high, [2] = phi_c (centre-in-slice override) + __shared__ bool s_fg[3][KABSCH_BLOCK_Y][KABSCH_BLOCK_X]; + + // Block-level reflection list: indices into the global reflection arrays + __shared__ size_t s_block_refl[MAX_BLOCK_REFLECTIONS]; + __shared__ size_t s_num_block_refl; + + const int tx = threadIdx.x; // Thread-local x index (also indexes shared memory) + const int ty = threadIdx.y; // Thread-local y index + + // Global corner coordinates. + // The stride between block origins is (BLOCK-1), NOT BLOCK. Compare: + // Normal: gx = blockIdx.x * blockDim.x + tx (stride 32) + // Here: gx = blockIdx.x * (KABSCH_BLOCK_X - 1) + tx (stride 31) + // + // This means Block 0 covers corners 0-31, Block 1 covers 31-62, etc. + // Corner 31 is computed by BOTH blocks — that's the overlap. It ensures + // pixel 30 (in Block 0) can read its right-side corner (31) from shared + // memory, and pixel 31 (in Block 1) can read its left-side corner (31) + // from its own block's shared memory. 
+ + const int gx = blockIdx.x * (KABSCH_BLOCK_X - 1) + tx; // Global x coordinate + const int gy = blockIdx.y * (KABSCH_BLOCK_Y - 1) + ty; // Global y coordinate + + // ── Per-Corner Setup ──────────────────────────────────────────── + // Two phi values define the z-extent of the voxel on this image + const scalar_t phi_low = degrees_to_radians( + osc_start + (static_cast(image_num - image_range_start) * osc_width)); + const scalar_t phi_high = degrees_to_radians( + osc_start + + (static_cast(image_num - image_range_start + 1) * osc_width)); + + // s_pixel depends only on detector position (gx, gy), not phi + const bool corner_valid = (gx >= 0) && (gx <= static_cast(width)) && (gy >= 0) + && (gy <= static_cast(height)); + + Vector3D s_pixel = make_vector3d(0, 0, 0); + if (corner_valid) { + Vector3D lab_coord = + make_vector3d(d_matrix[0] * gx + d_matrix[1] * gy + d_matrix[2], + d_matrix[3] * gx + d_matrix[4] * gy + d_matrix[5], + d_matrix[6] * gx + d_matrix[7] * gy + d_matrix[8]); + s_pixel = normalized(lab_coord) / wavelength; + } + + // ── Block Reflection Filter ───────────────────────────────────── + // Thread 0 scans the image-level reflection list and keeps only those + // whose bbox overlaps this block's corner footprint. + if (tx == 0 && ty == 0) { + s_num_block_refl = 0; + + const int bx_min = static_cast(blockIdx.x) * (KABSCH_BLOCK_X - 1); + const int bx_max = bx_min + KABSCH_BLOCK_X - 1; + const int by_min = static_cast(blockIdx.y) * (KABSCH_BLOCK_Y - 1); + const int by_max = by_min + KABSCH_BLOCK_Y - 1; + + for (size_t i = 0; i < num_reflections_this_image; ++i) { + const size_t refl_idx = d_reflection_indices[i]; + const BoundingBoxExtents &bbox = d_bboxes[refl_idx]; + + // Does the reflection's bbox overlap this block AND current image? 
+ const bool overlaps = + (bx_max >= bbox.x_min && bx_min <= bbox.x_max) + && (by_max >= bbox.y_min && by_min <= bbox.y_max) + && (image_num >= bbox.z_min && image_num < bbox.z_max); + + if (overlaps && s_num_block_refl < MAX_BLOCK_REFLECTIONS) { + s_block_refl[s_num_block_refl++] = refl_idx; + } + } + } + + __syncthreads(); + + // Early exit if no reflections overlap this block + if (s_num_block_refl == 0) return; + // Pitched image accessor + size_t elem_pitch = image_pitch / sizeof(pixel_t); + PitchedArray2D image(d_image, &elem_pitch); + + // Does this thread represent a complete pixel? + // A pixel at (tx, ty) reads corners (tx, ty), (tx+1, ty), (tx, ty+1), + // (tx+1, ty+1) from shared memory. For tx+1 and ty+1 to be valid + // indices, we need tx < 31 and ty < 15. Threads on the right column + // (tx == 31) and bottom row (ty == 15) only contribute their corner + // during Phase 1 — they don't process a pixel in Phase 2. + const bool is_pixel_thread = (tx < KABSCH_BLOCK_X - 1) && (ty < KABSCH_BLOCK_Y - 1); + const bool pixel_in_image = is_pixel_thread && (gx < static_cast(width)) + && (gy < static_cast(height)); + + // ── Reflection Processing Loop ────────────────────────────────── + // Process each reflection: compute corner flags, sync, then accumulate pixels + for (size_t r = 0; r < s_num_block_refl; ++r) { + const size_t refl_idx = s_block_refl[r]; + const Vector3D &s1_c = d_s1_vectors[refl_idx]; + const scalar_t phi_c = d_phi_values[refl_idx]; + const BoundingBoxExtents &bbox = d_bboxes[refl_idx]; + + // Every thread writes its corner's foreground flag to shared memory. + // Per-corner bbox check avoids needless Kabsch computation. 
+ const bool in_bbox = corner_valid && (gx >= bbox.x_min && gx <= bbox.x_max) + && (gy >= bbox.y_min && gy <= bbox.y_max); + + if (in_bbox) { + scalar_t s1_len; // unused here, required by pixel_to_kabsch + + // Lower z-face (phi_low) + Vector3D eps_lo = + pixel_to_kabsch(s0, s1_c, phi_c, s_pixel, phi_low, rot_axis, s1_len); + s_fg[0][ty][tx] = is_foreground(eps_lo, delta_b, delta_m); + + // Upper z-face (phi_high) + Vector3D eps_hi = + pixel_to_kabsch(s0, s1_c, phi_c, s_pixel, phi_high, rot_axis, s1_len); + s_fg[1][ty][tx] = is_foreground(eps_hi, delta_b, delta_m); + + // Centre z-slice: evaluate at phi_c where ε₃ = 0 + // This catches reflections entirely contained within the voxel's + // z-extent — the xy ellipsoid check still applies, so only pixels + // spatially close to the reflection centre are marked foreground. + const bool centre_in_z_slice = (phi_c >= phi_low && phi_c <= phi_high); + if (centre_in_z_slice) { + Vector3D eps_c = + pixel_to_kabsch(s0, s1_c, phi_c, s_pixel, phi_c, rot_axis, s1_len); + // eps_c.z should be ~0; check only ε₁²/δ_B² + ε₂²/δ_B² ≤ 1 + s_fg[2][ty][tx] = is_foreground(eps_c, delta_b, delta_m); + } else { + s_fg[2][ty][tx] = false; + } + } else { + s_fg[0][ty][tx] = false; + s_fg[1][ty][tx] = false; + s_fg[2][ty][tx] = false; + } + + __syncthreads(); + + // Interior threads check all 12 voxel corners (8 original + 4 at phi_c) + if (pixel_in_image) { + bool any_fg = false; +#pragma unroll + for (int z = 0; z < 3; ++z) { + // if any corner at this z-slice is foreground, the pixel is foreground + any_fg |= s_fg[z][ty][tx] | s_fg[z][ty][tx + 1] | s_fg[z][ty + 1][tx] + | s_fg[z][ty + 1][tx + 1]; + } + + // Only accumulate if the pixel centre is within the bbox + // (pixels outside the bbox are irrelevant to this reflection) + const bool px_in_bbox = (gx >= bbox.x_min && gx < bbox.x_max) + && (gy >= bbox.y_min && gy < bbox.y_max); + + if (px_in_bbox) { + const accumulator_t pixel_value = + static_cast(image(gx, gy)); + + if (any_fg) { + 
atomicAdd(&d_foreground_sum[refl_idx], pixel_value); + atomicAdd(&d_foreground_count[refl_idx], 1u); + } else { + atomicAdd(&d_background_sum[refl_idx], pixel_value); + atomicAdd(&d_background_count[refl_idx], 1u); + } + } + } + + __syncthreads(); // Barrier before next reflection overwrites s_fg + } +} + +/** + * @brief Host wrapper function for image-based Kabsch computation + * + * Launches the 8-corner Kabsch kernel using overlapping tiles. Each block + * of KABSCH_BLOCK_X × KABSCH_BLOCK_Y threads covers that many corners, + * processing (KABSCH_BLOCK_X-1) × (KABSCH_BLOCK_Y-1) complete pixels. + */ +void compute_kabsch_transform(pixel_t *d_image, + size_t image_pitch, + uint32_t width, + uint32_t height, + int image_num, + const scalar_t *d_matrix, + scalar_t wavelength, + scalar_t osc_start, + scalar_t osc_width, + int image_range_start, + Vector3D s0, + Vector3D rot_axis, + const Vector3D *d_s1_vectors, + const scalar_t *d_phi_values, + const BoundingBoxExtents *d_bboxes, + const size_t *d_reflection_indices, + size_t num_reflections_this_image, + scalar_t delta_b, + scalar_t delta_m, + accumulator_t *d_foreground_sum, + uint32_t *d_foreground_count, + accumulator_t *d_background_sum, + uint32_t *d_background_count, + cudaStream_t stream) { + // Configure kernel launch parameters. + // Every block is always the full 32×16 = 512 threads. + dim3 blockDim(KABSCH_BLOCK_X, KABSCH_BLOCK_Y); + + // Each block processes (BLOCK-1) complete pixels per dimension: + // 31 pixels in x, 15 pixels in y. + // So the number of blocks needed is ceil(image_pixels / pixels_per_block): + // gridDim.x = ceil(width / 31) + // gridDim.y = ceil(height / 15) + // Edge blocks may extend past the image boundary — those threads + // simply fail the bounds check in the kernel and write false / skip. 
+ dim3 gridDim(ceil_div(width, static_cast(KABSCH_BLOCK_X - 1)), + ceil_div(height, static_cast(KABSCH_BLOCK_Y - 1))); + + // Launch the summation integration kernel + kabsch_transform<<>>(d_image, + image_pitch, + width, + height, + image_num, + d_matrix, + wavelength, + osc_start, + osc_width, + image_range_start, + s0, + rot_axis, + d_s1_vectors, + d_phi_values, + d_bboxes, + d_reflection_indices, + num_reflections_this_image, + delta_b, + delta_m, + d_foreground_sum, + d_foreground_count, + d_background_sum, + d_background_count); + + // Check for kernel launch errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + fmt::format("Kernel launch failed: {}", cudaGetErrorString(err))); + } +} + +// DEPRECATED +/** + * @brief CUDA kernel to compute pixel-to-Kabsch transformations for + * multiple voxels + * + * This kernel transforms voxel coordinates from reciprocal space to + * Kabsch coordinates. + * Each thread processes one voxel, computing its Kabsch coordinates + * relative to a reflection center. 
+ * + * @param s_pixels Array of s_pixel vectors (different for each voxel) + * @param phi_pixels Array of phi_pixel angles (different for each voxel) + * @param s1_c Reflection center s1 vector (same for all voxels in this batch) + * @param phi_c Reflection center phi angle (same for all voxels in this batch) + * @param s0 Initial scattering vector + * @param rot_axis Rotation axis vector + * @param eps_array Output array for Kabsch coordinates + * @param s1_len_array Output array for s1 lengths + * @param n Number of voxels to process + */ +__global__ void kabsch_transform_flat(const Vector3D *__restrict__ s_pixels, + const scalar_t *__restrict__ phi_pixels, + Vector3D s1_c, + scalar_t phi_c, + Vector3D s0, + Vector3D rot_axis, + Vector3D *eps_array, + scalar_t *s1_len_array, + size_t n) { + // Calculate global thread index + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; // Bounds check + + // Get input data for this voxel + Vector3D s_pixel = s_pixels[i]; + scalar_t phi_pixel = phi_pixels[i]; + + // Compute Kabsch transformation using the device function + scalar_t s1_len; + Vector3D eps = + pixel_to_kabsch(s0, s1_c, phi_c, s_pixel, phi_pixel, rot_axis, s1_len); + + // Store results + eps_array[i] = eps; + s1_len_array[i] = s1_len; +} + +/** + * @brief Host wrapper function for voxel Kabsch computation + * + * @param h_s_pixels Host array of s_pixel vectors (different for each voxel) + * @param h_phi_pixels Host array of phi_pixel angles (different for each voxel) + * @param s1_c Reflection center s1 vector (same for all voxels in this reflection) + * @param phi_c Reflection center phi angle (same for all voxels in this reflection) + * @param s0 Initial scattering vector + * @param rot_axis Rotation axis vector + * @param h_eps Host array to store output Kabsch coordinates + * @param h_s1_len Host array to store output s1 lengths + * @param n Number of voxels + */ +void compute_kabsch_transform(const Vector3D *h_s_pixels, + const scalar_t 
*h_phi_pixels, + Vector3D s1_c, + scalar_t phi_c, + Vector3D s0, + Vector3D rot_axis, + Vector3D *h_eps, + scalar_t *h_s1_len, + size_t n) { + // Create RAII device buffers + DeviceBuffer d_s_pixels(n); // Input s_pixel vectors + DeviceBuffer d_phi_pixels(n); // Input phi_pixel angles + DeviceBuffer d_eps(n); // Output Kabsch coordinates + DeviceBuffer d_s1_len(n); // Output s1 lengths + + // Copy input data from host to device + d_s_pixels.assign(h_s_pixels); + d_phi_pixels.assign(h_phi_pixels); + + // Configure kernel launch parameters + int threadsPerBlock = 256; + int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; + + // Launch kernel + kabsch_transform_flat<<>>(d_s_pixels.data(), + d_phi_pixels.data(), + s1_c, + phi_c, + s0, + rot_axis, + d_eps.data(), + d_s1_len.data(), + n); + + // Check for kernel launch errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + fmt::format("Kernel launch failed: {}", cudaGetErrorString(err))); + } + + // Wait for kernel completion + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + throw std::runtime_error( + fmt::format("Kernel execution failed: {}", cudaGetErrorString(err))); + } + + // Copy results back to host + d_eps.extract(h_eps); + d_s1_len.extract(h_s1_len); +} \ No newline at end of file diff --git a/integrator/kabsch.cuh b/integrator/kabsch.cuh new file mode 100644 index 00000000..3fa12f69 --- /dev/null +++ b/integrator/kabsch.cuh @@ -0,0 +1,113 @@ +/** + * @file kabsch.cuh + * @brief Header file for CUDA Kabsch coordinate transformation functions + * + * This header declares the interface for GPU-accelerated coordinate transformations + * used in the Kabsch algorithm for data processing. The functions + * convert pixel coordinates from reciprocal space into a geometry-invariant local + * coordinate system for efficient reflection profile integration. 
+ * + * The Kabsch transformation creates a local coordinate system defined by: + * - e₁: Perpendicular to the scattering plane (s₁ᶜ × s₀) + * - e₂: Within the scattering plane, orthogonal to e₁ (s₁ᶜ × e₁) + * - e₃: Bisects the incident and diffracted beam directions (s₁ᶜ + s₀) + * + * @author Dimitrios Vlachos + * @date 2025 + * @see kabsch.cu for implementation details + * @see Vector3D for mathematical vector operations + * @see DeviceBuffer for GPU memory management + */ + +#pragma once + +#include + +#include + +#include "extent.hpp" +#include "h5read.h" +#include "math/device_precision.cuh" +#include "math/vector3d.cuh" + +/** + * @brief Host wrapper function for image-based Kabsch computation + * + * Launches a kernel over the entire image grid to compute Kabsch coordinates + * for pixels that fall within reflection bounding boxes, classifying each + * pixel as foreground or background and atomically accumulating intensities + * for summation integration. + * + * @param d_image Device pointer to image data (pitched allocation) + * @param image_pitch Pitch in bytes of the device image allocation + * @param width Image width in pixels + * @param height Image height in pixels + * @param image_num Current image/frame number + * @param d_matrix Detector matrix (3x3 flattened) for pixel to lab conversion + * @param wavelength Beam wavelength for s_pixel normalization + * @param osc_start Oscillation start angle (radians) + * @param osc_width Oscillation width per image (radians) + * @param image_range_start First image number in the scan + * @param s0 Incident beam vector + * @param rot_axis Rotation axis vector + * @param d_s1_vectors Device array of s1_c vectors for each reflection + * @param d_phi_values Device array of phi_c values for each reflection + * @param d_bboxes Device array of BoundingBoxExtents structs for each reflection + * @param d_reflection_indices Device array of reflection indices for this image + * @param num_reflections_this_image Number of 
reflections touching this image + * @param delta_b Foreground extent in e₁/e₂ directions (n_sigma × σ_D), radians + * @param delta_m Foreground extent in e₃ direction (n_sigma × σ_M), radians + * @param d_foreground_sum Device array to accumulate foreground intensities per reflection + * @param d_foreground_count Device array to count foreground pixels per reflection + * @param d_background_sum Device array to accumulate background intensities per reflection + * @param d_background_count Device array to count background pixels per reflection + * @param stream CUDA stream for async execution + */ +void compute_kabsch_transform(pixel_t *d_image, + size_t image_pitch, + uint32_t width, + uint32_t height, + int image_num, + const scalar_t *d_matrix, + scalar_t wavelength, + scalar_t osc_start, + scalar_t osc_width, + int image_range_start, + fastvec::Vector3D s0, + fastvec::Vector3D rot_axis, + const fastvec::Vector3D *d_s1_vectors, + const scalar_t *d_phi_values, + const BoundingBoxExtents *d_bboxes, + const size_t *d_reflection_indices, + size_t num_reflections_this_image, + scalar_t delta_b, + scalar_t delta_m, + accumulator_t *d_foreground_sum, + uint32_t *d_foreground_count, + accumulator_t *d_background_sum, + uint32_t *d_background_count, + cudaStream_t stream); + +// DEPRECATED +/** + * @brief Host wrapper function for voxel Kabsch computation + * + * @param h_s_pixels Host array of s_pixel vectors (different for each voxel) + * @param h_phi_pixels Host array of phi_pixel angles (different for each voxel) + * @param s1_c Reflection center s1 vector (same for all voxels in this reflection) + * @param phi_c Reflection center phi angle (same for all voxels in this reflection) + * @param s0 Initial scattering vector + * @param rot_axis Rotation axis vector + * @param h_eps Host array to store output Kabsch coordinates + * @param h_s1_len Host array to store output s1 lengths + * @param n Number of voxels + */ +void compute_kabsch_transform(const fastvec::Vector3D 
*h_s_pixels, + const scalar_t *h_phi_pixels, + fastvec::Vector3D s1_c, + scalar_t phi_c, + fastvec::Vector3D s0, + fastvec::Vector3D rot_axis, + fastvec::Vector3D *h_eps, + scalar_t *h_s1_len, + size_t n); \ No newline at end of file diff --git a/integrator/tests/CMakeLists.txt b/integrator/tests/CMakeLists.txt new file mode 100644 index 00000000..492ea5ed --- /dev/null +++ b/integrator/tests/CMakeLists.txt @@ -0,0 +1,31 @@ +include(GoogleTest) + +add_executable(test_extent test_extent.cc ../extent.cc) +target_include_directories(test_extent PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) +target_link_libraries(test_extent PRIVATE + ffs_common + dx2 + GTest::gtest_main + Eigen3::Eigen + spdlog::spdlog + nlohmann_json::nlohmann_json +) + +add_executable(test_kabsch test_kabsch.cc ../kabsch.cu) +set_source_files_properties(../kabsch.cu PROPERTIES LANGUAGE CUDA) +target_include_directories(test_kabsch PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) +target_link_libraries(test_kabsch PRIVATE + ffs_common + dx2 + h5read + GTest::gtest_main + Eigen3::Eigen + spdlog::spdlog + nlohmann_json::nlohmann_json + CUDA::cudart + lodepng +) +set_property(TARGET test_kabsch PROPERTY CUDA_SEPARABLE_COMPILATION ON) + +gtest_discover_tests(test_extent PROPERTIES LABELS "integrator-tests") +gtest_discover_tests(test_kabsch PROPERTIES LABELS "integrator-tests") \ No newline at end of file diff --git a/integrator/tests/data/indexed.expt b/integrator/tests/data/indexed.expt new file mode 100644 index 00000000..d35ba516 --- /dev/null +++ b/integrator/tests/data/indexed.expt @@ -0,0 +1,716 @@ +{ + "__id__": "ExperimentList", + "experiment": [ + { + "__id__": "Experiment", + "identifier": "215394b2-07ff-52c4-2596-1bc0ddcc36dd", + "beam": 0, + "detector": 0, + "goniometer": 0, + "scan": 0, + "crystal": 0, + "imageset": 0 + } + ], + "history": [ + "2025-11-27T16:32:24Z|dials.import|v3.25.dev0", + "2025-11-27T16:33:00Z|dials.index|v3.25.dev0" + ], + "imageset": [ + { + "__id__": "ImageSequence", + "template": 
"/dls/science/groups/scisoft/DIALS/dials_data/thaumatin_i03_rotation/thau_2_1.nxs", + "single_file_indices": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99 + ], + "mask": null, + "gain": null, + "pedestal": null, + "dx": null, + "dy": null, + "params": { + "dynamic_shadowing": "Auto", + "multi_panel": false + } + } + ], + "beam": [ + { + "__id__": "monochromatic", + "direction": [ + -0.0034229667619783467, + -0.0, + 0.999994141632113 + ], + "wavelength": 0.9762458439949315, + "divergence": 0.0, + "sigma_divergence": 0.0, + "polarization_normal": [ + 0.0, + 1.0, + 0.0 + ], + "polarization_fraction": 0.999, + "flux": 0.0, + "transmission": 1.0, + "probe": "x-ray", + "sample_to_source_distance": 0.0 + } + ], + "detector": [ + { + "panels": [ + { + "name": "/entry/instrument/detector/module", + "type": "SENSOR_PAD", + "fast_axis": [ + 0.9999343565703017, + 0.008972928699089156, + 0.007125243918478585 + ], + "slow_axis": [ + 0.009011312015482328, + -0.9999449607587793, + -0.005373240073119384 + ], + "origin": [ + -154.36992588196173, + 164.3947680648576, + -198.15267891180412 + ], + "raw_image_offset": [ + 0, + 0 + ], + "image_size": [ + 4148, + 4362 + ], + "pixel_size": [ + 0.075, + 0.075 + ], + "trusted_range": [ + 0.0, + 46051.0 + ], + "thickness": 0.45000000000000007, + "material": "Si", + "mu": 3.9219876752936167, + "identifier": "", + "mask": [], + "gain": 1.0, + "pedestal": 0.0, + "px_mm_strategy": { + "type": 
"ParallaxCorrectedPxMmStrategy" + } + } + ], + "hierarchy": { + "name": "", + "type": "", + "fast_axis": [ + 1.0, + 0.0, + 0.0 + ], + "slow_axis": [ + 0.0, + 1.0, + 0.0 + ], + "origin": [ + 0.0, + 0.0, + 0.0 + ], + "raw_image_offset": [ + 0, + 0 + ], + "image_size": [ + 0, + 0 + ], + "pixel_size": [ + 0.0, + 0.0 + ], + "trusted_range": [ + 0.0, + 0.0 + ], + "thickness": 0.0, + "material": "", + "mu": 0.0, + "identifier": "", + "mask": [], + "gain": 1.0, + "pedestal": 0.0, + "px_mm_strategy": { + "type": "SimplePxMmStrategy" + }, + "children": [ + { + "panel": 0 + } + ] + } + } + ], + "goniometer": [ + { + "axes": [ + [ + 1.0, + -0.0025, + 0.0056 + ], + [ + -0.006, + -0.0264, + -0.9996 + ], + [ + 1.0, + 0.0, + 0.0 + ] + ], + "angles": [ + 0.0, + 0.0, + 0.0 + ], + "names": [ + "phi", + "chi", + "omega" + ], + "scan_axis": 2 + } + ], + "scan": [ + { + "image_range": [ + 1, + 100 + ], + "batch_offset": 0, + "properties": { + "epochs": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "exposure_time": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "oscillation": [ + 0.0, + 0.09999999999999964, + 0.1999999999999993, + 0.29999999999999893, + 0.3999999999999986, + 0.4999999999999982, + 0.5999999999999979, + 0.6999999999999975, + 0.7999999999999972, + 0.8999999999999969, + 0.9999999999999964, + 1.0999999999999959, + 1.1999999999999957, + 1.2999999999999954, + 1.399999999999995, + 1.4999999999999944, + 1.5999999999999943, + 1.699999999999994, + 1.7999999999999938, + 1.899999999999993, + 1.999999999999993, + 2.0999999999999925, + 2.1999999999999917, + 2.299999999999992, + 2.3999999999999915, + 2.499999999999991, + 2.5999999999999908, + 2.6999999999999904, + 2.79999999999999, + 2.8999999999999897, + 2.999999999999989, + 3.099999999999989, + 3.1999999999999886, + 3.2999999999999883, + 3.399999999999988, + 3.4999999999999876, + 3.5999999999999877, + 3.699999999999987, + 3.799999999999986, + 3.899999999999986, + 3.999999999999986, + 4.099999999999985, + 4.199999999999985, + 4.299999999999985, + 4.3999999999999835, + 4.499999999999984, + 4.599999999999984, + 4.699999999999983, + 4.799999999999983, + 4.899999999999983, + 4.999999999999982, + 5.099999999999982, + 5.1999999999999815, + 5.299999999999981, + 5.399999999999981, + 5.4999999999999805, + 5.59999999999998, + 5.69999999999998, + 5.799999999999979, + 5.899999999999979, + 5.999999999999978, + 6.099999999999978, + 6.199999999999978, + 6.299999999999978, + 6.399999999999977, + 6.499999999999976, + 6.5999999999999766, + 6.699999999999976, + 6.799999999999976, + 6.8999999999999755, + 6.999999999999975, + 7.099999999999975, + 7.199999999999975, + 7.299999999999973, + 7.399999999999974, + 
7.499999999999974, + 7.599999999999972, + 7.699999999999973, + 7.799999999999972, + 7.899999999999971, + 7.999999999999972, + 8.099999999999971, + 8.19999999999997, + 8.29999999999997, + 8.39999999999997, + 8.499999999999968, + 8.59999999999997, + 8.699999999999969, + 8.799999999999967, + 8.899999999999968, + 8.999999999999968, + 9.099999999999968, + 9.199999999999967, + 9.299999999999967, + 9.399999999999967, + 9.499999999999966, + 9.599999999999966, + 9.699999999999966, + 9.799999999999965, + 9.899999999999965 + ] + }, + "valid_image_ranges": {} + } + ], + "crystal": [ + { + "__id__": "crystal", + "real_space_a": [ + -18.3617922001806, + -1.3985262449809124, + -54.866705726818 + ], + "real_space_b": [ + -32.09880690850544, + 47.2184015325075, + 9.47764074028691 + ], + "real_space_c": [ + 115.77476247140888, + 86.89078795711316, + -40.81230881268615 + ], + "space_group_hall_symbol": " P 1", + "B_covariance": [ + 1.4129660242383127e-11, + -9.629396138302801e-28, + 3.5689987328237737e-28, + -2.4872540268126837e-12, + 7.280944537835667e-12, + 3.5689987328237737e-28, + 3.646488633397222e-12, + -5.820541202322426e-13, + 2.914308627849801e-12, + -9.629396138302803e-28, + 8.726639160350873e-44, + -2.778693699413534e-44, + 3.40351166637644e-28, + -5.955194306558846e-28, + -2.778693699413534e-44, + -3.2261385665732015e-28, + 1.1706473372740525e-28, + -2.2689755947169165e-28, + 3.5689987328237737e-28, + -2.778693699413534e-44, + 1.0321997888456188e-44, + -7.002309989901791e-29, + 2.013272961354654e-28, + 1.0321997888456188e-44, + 1.2902506870731577e-28, + -2.5570263336226253e-29, + 8.42855090597776e-29, + -2.4872540268126837e-12, + 3.4035116663764394e-28, + -7.002309989901791e-29, + 3.0011386068406224e-12, + -2.0716802701697863e-12, + -7.002309989901791e-29, + -3.140331758403637e-13, + 7.074979130795366e-13, + -5.717820023517858e-13, + 7.280944537835668e-12, + -5.955194306558845e-28, + 2.0132729613546538e-28, + -2.0716802701697863e-12, + 4.324985322609059e-12, + 
2.0132729613546538e-28, + 1.9954853137207196e-12, + -5.377998581508811e-13, + 1.6439621307599652e-12, + 3.5689987328237737e-28, + -2.778693699413534e-44, + 1.0321997888456188e-44, + -7.002309989901791e-29, + 2.013272961354654e-28, + 1.0321997888456188e-44, + 1.2902506870731577e-28, + -2.5570263336226253e-29, + 8.42855090597776e-29, + 3.6464886333972215e-12, + -3.226138566573202e-28, + 1.2902506870731577e-28, + -3.140331758403637e-13, + 1.9954853137207196e-12, + 1.2902506870731577e-28, + 3.3244312330737377e-12, + -6.388567421433584e-13, + 1.0535696398108259e-12, + -5.820541202322426e-13, + 1.1706473372740525e-28, + -2.5570263336226253e-29, + 7.074979130795364e-13, + -5.377998581508811e-13, + -2.5570263336226253e-29, + -6.388567421433584e-13, + 4.1810623500928776e-13, + -2.0879704543407345e-13, + 2.9143086278498007e-12, + -2.2689755947169165e-28, + 8.42855090597776e-29, + -5.717820023517859e-13, + 1.6439621307599652e-12, + 8.42855090597776e-29, + 1.0535696398108257e-12, + -2.0879704543407345e-13, + 6.882434112305724e-13 + ] + } + ], + "profile": [], + "scaling_model": [] +} \ No newline at end of file diff --git a/integrator/tests/data/predicted.refl b/integrator/tests/data/predicted.refl new file mode 100644 index 00000000..9fc1e414 Binary files /dev/null and b/integrator/tests/data/predicted.refl differ diff --git a/integrator/tests/data/simple_integrated.refl b/integrator/tests/data/simple_integrated.refl new file mode 100644 index 00000000..8ae4d3e4 Binary files /dev/null and b/integrator/tests/data/simple_integrated.refl differ diff --git a/integrator/tests/test_extent.cc b/integrator/tests/test_extent.cc new file mode 100644 index 00000000..7d2cbffa --- /dev/null +++ b/integrator/tests/test_extent.cc @@ -0,0 +1,402 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "extent.hpp" +#include "ffs_logger.hpp" +#include "math/math_utils.cuh" + +/** + * @brief Validate computed bounding boxes against 
existing bbox column + * + * Compares the first N bounding boxes and writes results to a validation file. + * + * @param reflections The reflection table containing existing bbox column + * @param computed_bbox_data Flat array of computed bounding boxes (6 values per reflection) + * @param num_reflections Total number of reflections + * @param num_to_display Number of bounding boxes to display for comparison + * @return 0 on success, 1 on error + */ +int validate_bounding_boxes(ReflectionTable &reflections, + const std::vector &computed_bbox_data, + size_t num_reflections, + size_t num_to_display = 10, + const std::string &data_path = "") { + logger.info( + "Validation mode: comparing computed bounding boxes with existing bbox column"); + + // Load existing bounding boxes + auto bbox_column_opt = reflections.column(data_path + "bbox"); + if (!bbox_column_opt) { + logger.error("Column 'bbox' not found in reflection data for validation."); + return 1; + } + auto bbox_column = *bbox_column_opt; + + // Statistics for comparison + double total_x_min_diff = 0.0, total_x_max_diff = 0.0; + double total_y_min_diff = 0.0, total_y_max_diff = 0.0; + double total_z_min_diff = 0.0, total_z_max_diff = 0.0; + double max_x_min_diff = 0.0, max_x_max_diff = 0.0; + double max_y_min_diff = 0.0, max_y_max_diff = 0.0; + double max_z_min_diff = 0.0, max_z_max_diff = 0.0; + + logger.info("First {} bounding box comparisons:", num_to_display); + for (size_t i = 0; i < num_reflections; ++i) { + // Existing bbox + double ex_x_min = bbox_column(i, 0); + double ex_x_max = bbox_column(i, 1); + double ex_y_min = bbox_column(i, 2); + double ex_y_max = bbox_column(i, 3); + int ex_z_min = static_cast(bbox_column(i, 4)); + int ex_z_max = static_cast(bbox_column(i, 5)); + + // Computed bbox (from flat array) + double comp_x_min = computed_bbox_data[i * 6 + 0]; + double comp_x_max = computed_bbox_data[i * 6 + 1]; + double comp_y_min = computed_bbox_data[i * 6 + 2]; + double comp_y_max = 
computed_bbox_data[i * 6 + 3]; + int comp_z_min = static_cast(computed_bbox_data[i * 6 + 4]); + int comp_z_max = static_cast(computed_bbox_data[i * 6 + 5]); + + // Compute differences + double diff_x_min = std::abs(comp_x_min - ex_x_min); + double diff_x_max = std::abs(comp_x_max - ex_x_max); + double diff_y_min = std::abs(comp_y_min - ex_y_min); + double diff_y_max = std::abs(comp_y_max - ex_y_max); + double diff_z_min = std::abs(comp_z_min - ex_z_min); + double diff_z_max = std::abs(comp_z_max - ex_z_max); + + // Accumulate statistics + total_x_min_diff += diff_x_min; + total_x_max_diff += diff_x_max; + total_y_min_diff += diff_y_min; + total_y_max_diff += diff_y_max; + total_z_min_diff += diff_z_min; + total_z_max_diff += diff_z_max; + + max_x_min_diff = std::max(max_x_min_diff, diff_x_min); + max_x_max_diff = std::max(max_x_max_diff, diff_x_max); + max_y_min_diff = std::max(max_y_min_diff, diff_y_min); + max_y_max_diff = std::max(max_y_max_diff, diff_y_max); + max_z_min_diff = std::max(max_z_min_diff, diff_z_min); + max_z_max_diff = std::max(max_z_max_diff, diff_z_max); + + // Display first N comparisons + if (i < num_to_display) { + logger.info( + "bbox[{}]: existing x=[{:.1f},{:.1f}] y=[{:.1f},{:.1f}] z=[{},{}]", + i, + ex_x_min, + ex_x_max, + ex_y_min, + ex_y_max, + ex_z_min, + ex_z_max); + logger.info( + "bbox[{}]: computed x=[{:.1f},{:.1f}] y=[{:.1f},{:.1f}] z=[{},{}]", + i, + comp_x_min, + comp_x_max, + comp_y_min, + comp_y_max, + comp_z_min, + comp_z_max); + logger.info( + "bbox[{}]: diff x=[{:.1f},{:.1f}] y=[{:.1f},{:.1f}] " + "z=[{:.1f},{:.1f}]", + i, + diff_x_min, + diff_x_max, + diff_y_min, + diff_y_max, + diff_z_min, + diff_z_max); + } + } + + // Compute and display average differences + double n = static_cast(num_reflections); + logger.info("\n=== Bounding Box Comparison Statistics ==="); + logger.info("Mean absolute differences:"); + logger.info(" x_min: {:.3f} pixels, x_max: {:.3f} pixels", + total_x_min_diff / n, + total_x_max_diff / n); + 
logger.info(" y_min: {:.3f} pixels, y_max: {:.3f} pixels", + total_y_min_diff / n, + total_y_max_diff / n); + logger.info(" z_min: {:.3f} frames, z_max: {:.3f} frames", + total_z_min_diff / n, + total_z_max_diff / n); + logger.info("Maximum absolute differences:"); + logger.info( + " x_min: {:.3f} pixels, x_max: {:.3f} pixels", max_x_min_diff, max_x_max_diff); + logger.info( + " y_min: {:.3f} pixels, y_max: {:.3f} pixels", max_y_min_diff, max_y_max_diff); + logger.info( + " z_min: {:.3f} frames, z_max: {:.3f} frames", max_z_min_diff, max_z_max_diff); + + // Add computed bounding boxes to reflection table for comparison + reflections.add_column( + "computed_bbox", std::vector{num_reflections, 6}, computed_bbox_data); + + // Write output file with comparison + reflections.write("validation_reflections.h5"); + logger.info("Validation results saved to validation_reflections.h5"); + + return 0; +} + +// Test fixture for integrator tests with file paths +class ExtentValidationTest : public ::testing::Test { + protected: + void SetUp() override { + // Get paths relative to test directory + test_dir = std::filesystem::path(__FILE__).parent_path(); + predicted_refl = test_dir / "data" / "predicted.refl"; + indexed_expt = test_dir / "data" / "indexed.expt"; + integrated_truth_refl = test_dir / "data" / "simple_integrated.refl"; + } + + std::filesystem::path test_dir; + std::filesystem::path predicted_refl; + std::filesystem::path indexed_expt; + std::filesystem::path integrated_truth_refl; +}; + +TEST_F(ExtentValidationTest, ComputeKabschBoundingBoxes) { + // Check if test files exist + ASSERT_TRUE(std::filesystem::exists(predicted_refl)) + << "Reflection file not found: " << predicted_refl; + ASSERT_TRUE(std::filesystem::exists(indexed_expt)) + << "Experiment file not found: " << indexed_expt; + ASSERT_TRUE(std::filesystem::exists(integrated_truth_refl)) + << "Integrated reflection file not found: " << integrated_truth_refl; + + // Load source reflection data (indexed 
reflections for computing) + logger.info("Loading indexed data from file: {}", predicted_refl.string()); + ReflectionTable reflections(predicted_refl.string()); + + // Load integrated reflections for comparison (contains bbox) + logger.info("Loading integrated data for comparison from: {}", + integrated_truth_refl.string()); + ReflectionTable integrated_reflections(integrated_truth_refl.string()); + + // Log available columns in integrated reflections + // logger.info("Available columns in integrated reflections:"); + // for (const auto &name : integrated_reflections.get_column_names()) { + // logger.info(" - {}", name); + // } + + // HDF5 data path prefixes for reflection columns + const std::string indexed_data_path = ""; + const std::string integrated_data_path = ""; + + // Extract required columns from indexed reflections + auto s1_vectors_opt = reflections.column(indexed_data_path + "s1"); + ASSERT_TRUE(s1_vectors_opt.has_value()) + << "Column 's1' not found in reflection data."; + auto s1_vectors = *s1_vectors_opt; + + auto phi_column_opt = reflections.column(indexed_data_path + "xyzcal.mm"); + ASSERT_TRUE(phi_column_opt.has_value()) + << "Column 'xyzcal.mm' not found for phi positions."; + auto phi_column = *phi_column_opt; + + // Extract bbox from integrated reflections for comparison + auto bbox_column_opt = + integrated_reflections.column(integrated_data_path + "bbox"); + ASSERT_TRUE(bbox_column_opt.has_value()) + << "Column 'bbox' not found in integrated reflection data."; + auto bbox_column = *bbox_column_opt; + + size_t num_reflections = s1_vectors.extent(0); + logger.info("Processing {} reflections", num_reflections); + + // Parse experiment list from JSON + std::ifstream f(indexed_expt); + nlohmann::json elist_json_obj; + try { + elist_json_obj = nlohmann::json::parse(f); + } catch (nlohmann::json::parse_error &ex) { + FAIL() << "Failed to parse experiment file: " << ex.what(); + } + + // Construct Experiment object + Experiment expt; + try { + expt 
= Experiment(elist_json_obj); + } catch (const std::invalid_argument &ex) { + FAIL() << "Failed to construct Experiment: " << ex.what(); + } + + // Extract experimental components + MonochromaticBeam beam = expt.beam(); + Goniometer gonio = expt.goniometer(); + const Panel &panel = expt.detector().panels()[0]; + const Scan &scan = expt.scan(); + + Eigen::Vector3d s0 = beam.get_s0(); + Eigen::Vector3d rotation_axis = gonio.get_rotation_axis(); + + // Use baseline values for sigma_b and sigma_m + double sigma_b = degrees_to_radians(0.01); + double sigma_m = degrees_to_radians(0.01); + + // Compute bounding boxes using CPU-based Kabsch coordinate system + logger.info("Computing Kabsch bounding boxes for {} reflections", num_reflections); + + std::vector computed_bboxes = + compute_kabsch_bounding_boxes(s0, + rotation_axis, + s1_vectors, + phi_column, + num_reflections, + sigma_b, + sigma_m, + panel, + scan, + beam); + + ASSERT_EQ(computed_bboxes.size(), num_reflections) + << "Computed bboxes size mismatch"; + + // Convert BoundingBoxExtents to flat array format + std::vector computed_bbox_data(num_reflections * 6); + for (size_t i = 0; i < num_reflections; ++i) { + computed_bbox_data[i * 6 + 0] = computed_bboxes[i].x_min; + computed_bbox_data[i * 6 + 1] = computed_bboxes[i].x_max; + computed_bbox_data[i * 6 + 2] = computed_bboxes[i].y_min; + computed_bbox_data[i * 6 + 3] = computed_bboxes[i].y_max; + computed_bbox_data[i * 6 + 4] = static_cast(computed_bboxes[i].z_min); + computed_bbox_data[i * 6 + 5] = static_cast(computed_bboxes[i].z_max); + } + + // Validate against existing bounding boxes from integrated reflections + int result = validate_bounding_boxes(integrated_reflections, + computed_bbox_data, + num_reflections, + 10, + integrated_data_path); + EXPECT_EQ(result, 0) << "Validation failed"; + + // Check for verbose mode via environment variable + bool verbose = std::getenv("VERBOSE_TEST") != nullptr; + + // Track mismatches for summary + size_t 
total_mismatches = 0; + size_t x_min_mismatches = 0, x_max_mismatches = 0; + size_t y_min_mismatches = 0, y_max_mismatches = 0; + size_t z_min_mismatches = 0, z_max_mismatches = 0; + + // Compare computed bounding boxes with existing ones - fail if any differences + for (size_t i = 0; i < num_reflections; ++i) { + const auto &bbox = computed_bboxes[i]; + + // Check that bounding boxes are reasonable + EXPECT_LE(bbox.x_min, bbox.x_max) << "Invalid x bounds for reflection " << i; + EXPECT_LE(bbox.y_min, bbox.y_max) << "Invalid y bounds for reflection " << i; + EXPECT_LE(bbox.z_min, bbox.z_max) << "Invalid z bounds for reflection " << i; + + // Check that bounding boxes have non-zero size + EXPECT_GT(bbox.x_max - bbox.x_min, 0) << "Zero width x for reflection " << i; + EXPECT_GT(bbox.y_max - bbox.y_min, 0) << "Zero width y for reflection " << i; + EXPECT_GT(bbox.z_max - bbox.z_min, 0) << "Zero width z for reflection " << i; + + // Compare with existing bbox - fail on any difference + double ex_x_min = bbox_column(i, 0); + double ex_x_max = bbox_column(i, 1); + double ex_y_min = bbox_column(i, 2); + double ex_y_max = bbox_column(i, 3); + int ex_z_min = static_cast(bbox_column(i, 4)); + int ex_z_max = static_cast(bbox_column(i, 5)); + + bool has_mismatch = false; + + if (bbox.x_min != ex_x_min) { + x_min_mismatches++; + has_mismatch = true; + if (verbose) { + EXPECT_EQ(bbox.x_min, ex_x_min) + << "x_min mismatch for reflection " << i; + } + } + if (bbox.x_max != ex_x_max) { + x_max_mismatches++; + has_mismatch = true; + if (verbose) { + EXPECT_EQ(bbox.x_max, ex_x_max) + << "x_max mismatch for reflection " << i; + } + } + if (bbox.y_min != ex_y_min) { + y_min_mismatches++; + has_mismatch = true; + if (verbose) { + EXPECT_EQ(bbox.y_min, ex_y_min) + << "y_min mismatch for reflection " << i; + } + } + if (bbox.y_max != ex_y_max) { + y_max_mismatches++; + has_mismatch = true; + if (verbose) { + EXPECT_EQ(bbox.y_max, ex_y_max) + << "y_max mismatch for reflection " << i; + } + 
} + if (bbox.z_min != ex_z_min) { + z_min_mismatches++; + has_mismatch = true; + if (verbose) { + EXPECT_EQ(bbox.z_min, ex_z_min) + << "z_min mismatch for reflection " << i; + } + } + if (bbox.z_max != ex_z_max) { + z_max_mismatches++; + has_mismatch = true; + if (verbose) { + EXPECT_EQ(bbox.z_max, ex_z_max) + << "z_max mismatch for reflection " << i; + } + } + + if (has_mismatch) { + total_mismatches++; + } + } + + // Print summary + logger.info("\n=== Bounding Box Comparison Summary ==="); + logger.info("Total reflections checked: {}", num_reflections); + logger.info("Reflections with mismatches: {}", total_mismatches); + if (total_mismatches > 0) { + logger.info("Mismatches by component:"); + logger.info(" x_min: {} mismatches", x_min_mismatches); + logger.info(" x_max: {} mismatches", x_max_mismatches); + logger.info(" y_min: {} mismatches", y_min_mismatches); + logger.info(" y_max: {} mismatches", y_max_mismatches); + logger.info(" z_min: {} mismatches", z_min_mismatches); + logger.info(" z_max: {} mismatches", z_max_mismatches); + logger.info("\nSet VERBOSE_TEST=1 to see detailed mismatch output"); + } + + // Fail test if any mismatches found + EXPECT_EQ(total_mismatches, 0) + << "Found " << total_mismatches << " reflections with bounding box mismatches"; + + logger.info("Validation complete for {} reflections", num_reflections); +} + +TEST(IntegratorTest, BasicTest) { + EXPECT_TRUE(true); +} diff --git a/integrator/tests/test_kabsch.cc b/integrator/tests/test_kabsch.cc new file mode 100644 index 00000000..cf112f60 --- /dev/null +++ b/integrator/tests/test_kabsch.cc @@ -0,0 +1,189 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ffs_logger.hpp" +#include "kabsch.cuh" +#include "math/vector3d.cuh" + +// Test fixture for Kabsch tests with file paths +class KabschTest : public ::testing::Test { + protected: + void SetUp() override { + // Get paths relative to test directory + test_dir = 
std::filesystem::path(__FILE__).parent_path(); + predicted_refl = test_dir / "data" / "predicted.refl"; + indexed_expt = test_dir / "data" / "indexed.expt"; + } + + std::filesystem::path test_dir; + std::filesystem::path predicted_refl; + std::filesystem::path indexed_expt; +}; + +TEST_F(KabschTest, ComputeKabschTransform) { + // Check if test files exist + ASSERT_TRUE(std::filesystem::exists(predicted_refl)) + << "Reflection file not found: " << predicted_refl; + ASSERT_TRUE(std::filesystem::exists(indexed_expt)) + << "Experiment file not found: " << indexed_expt; + + // Load reflection data + logger.info("Loading reflection data from file: {}", predicted_refl.string()); + ReflectionTable reflections(predicted_refl.string()); + + const std::string data_path = ""; + + // Extract required columns + auto s1_vectors_opt = reflections.column(data_path + "s1"); + ASSERT_TRUE(s1_vectors_opt.has_value()) + << "Column 's1' not found in reflection data."; + auto s1_vectors = *s1_vectors_opt; + + auto phi_column_opt = reflections.column(data_path + "xyzcal.mm"); + ASSERT_TRUE(phi_column_opt.has_value()) + << "Column 'xyzcal.mm' not found for phi positions."; + auto phi_column = *phi_column_opt; + + size_t num_reflections = s1_vectors.extent(0); + logger.info("Processing {} reflections for Kabsch transformation", num_reflections); + + // Parse experiment file + std::ifstream f(indexed_expt); + nlohmann::json elist_json_obj; + try { + elist_json_obj = nlohmann::json::parse(f); + } catch (nlohmann::json::parse_error &ex) { + FAIL() << "Failed to parse experiment file: " << ex.what(); + } + + // Construct Experiment object + Experiment expt; + try { + expt = Experiment(elist_json_obj); + } catch (const std::invalid_argument &ex) { + FAIL() << "Failed to construct Experiment: " << ex.what(); + } + + // Extract experimental components + MonochromaticBeam beam = expt.beam(); + Goniometer gonio = expt.goniometer(); + const Panel &panel = expt.detector().panels()[0]; + const Scan 
&scan = expt.scan(); + + Eigen::Vector3d s0_eigen = beam.get_s0(); + Eigen::Vector3d rotation_axis_eigen = gonio.get_rotation_axis(); + double wl = beam.get_wavelength(); + double osc_start = scan.get_oscillation()[0]; + double osc_width = scan.get_oscillation()[1]; + int image_range_start = scan.get_image_range()[0]; + + // Convert to CUDA format + fastvec::Vector3D s0_cuda = + fastvec::make_vector3d(s0_eigen.x(), s0_eigen.y(), s0_eigen.z()); + fastvec::Vector3D rotation_axis_cuda = fastvec::make_vector3d( + rotation_axis_eigen.x(), rotation_axis_eigen.y(), rotation_axis_eigen.z()); + + logger.info("Testing Kabsch transformation for first reflection"); + + // Test on the first reflection + size_t refl_id = 0; + + // Get reflection centroid data + Eigen::Vector3d s1_c_eigen( + s1_vectors(refl_id, 0), s1_vectors(refl_id, 1), s1_vectors(refl_id, 2)); + fastvec::Vector3D s1_c_cuda = + fastvec::make_vector3d(s1_c_eigen.x(), s1_c_eigen.y(), s1_c_eigen.z()); + double phi_c = phi_column(refl_id, 2); + + logger.info("Reflection {} centroid: s1=({:.6f}, {:.6f}, {:.6f}), phi={:.6f}", + refl_id, + s1_c_eigen.x(), + s1_c_eigen.y(), + s1_c_eigen.z(), + phi_c); + + // Create test voxels around the reflection centroid + std::vector<fastvec::Vector3D> test_s_pixels; + std::vector<scalar_t> test_phi_pixels; + + // Test with a few voxel positions + const int num_test_voxels = 5; + for (int i = 0; i < num_test_voxels; ++i) { + // Create slight variations around the centroid + double x_offset = 100.0 + i * 10.0; // pixels + double y_offset = 200.0 + i * 10.0; // pixels + int z_offset = i; // frames + + // Convert pixel coordinates to lab frame + std::array<double, 2> xy_mm = panel.px_to_mm(x_offset, y_offset); + Eigen::Vector3d lab_coord = + panel.get_d_matrix() * Eigen::Vector3d(xy_mm[0], xy_mm[1], 1.0); + + // Convert to reciprocal space vector + Eigen::Vector3d s_pixel_eigen = lab_coord.normalized() / wl; + fastvec::Vector3D s_pixel_cuda = fastvec::make_vector3d( + s_pixel_eigen.x(), s_pixel_eigen.y(),
s_pixel_eigen.z()); + + // Calculate phi for this voxel + double phi_pixel = + osc_start + (z_offset - image_range_start + 1.5) * osc_width / 180.0 * M_PI; + + test_s_pixels.push_back(s_pixel_cuda); + test_phi_pixels.push_back(static_cast<scalar_t>(phi_pixel)); + + logger.info("Test voxel {}: pixel=({:.1f}, {:.1f}, {}), phi={:.6f}", + i, + x_offset, + y_offset, + z_offset, + phi_pixel); + } + + // Allocate output arrays + std::vector<fastvec::Vector3D> kabsch_results(num_test_voxels); + std::vector<scalar_t> s1_len_results(num_test_voxels); + + // Call CUDA Kabsch transformation + logger.info("Calling compute_kabsch_transform with {} test voxels", + num_test_voxels); + compute_kabsch_transform(test_s_pixels.data(), + test_phi_pixels.data(), + s1_c_cuda, + static_cast<scalar_t>(phi_c), + s0_cuda, + rotation_axis_cuda, + kabsch_results.data(), + s1_len_results.data(), + num_test_voxels); + + logger.info("Kabsch transformation completed successfully"); + + // Validate results + for (int i = 0; i < num_test_voxels; ++i) { + const auto &eps = kabsch_results[i]; + scalar_t s1_len = s1_len_results[i]; + + logger.info("Voxel {}: Kabsch coords ε=({:.6f}, {:.6f}, {:.6f}), |s₁|={:.6f}", + i, + eps.x, + eps.y, + eps.z, + s1_len); + + // Check results + + // Check that s1 length is positive + EXPECT_GT(s1_len, 0.0f) << "|s₁| should be positive for voxel " << i; + } + + logger.info("Kabsch transformation test completed successfully"); +} diff --git a/tests/test_integrator.py b/tests/test_integrator.py new file mode 100644 index 00000000..2f752c73 --- /dev/null +++ b/tests/test_integrator.py @@ -0,0 +1,172 @@ +""" +Tests for the fast-feedback integrator.
+ +Test command: +bin/integrator +../integrator/tests/comp/filtered.refl +../integrator/tests/comp/indexed.expt --sigma_m 0.002 --sigma_b 0.0002 +""" + +import os +import subprocess +from pathlib import Path + +import h5py + + +def test_integrator_basic(dials_data, tmp_path): + """Test basic integrator functionality with reflection and experiment files.""" + + integrator_path: str | Path | None = os.getenv("INTEGRATOR") + assert integrator_path, "INTEGRATOR environment variable not set" + + # /dls/science/groups/scisoft/DIALS/dials_data/thaumatin_i03_rotation/ + dials_data_path = dials_data("thaumatin_i03_rotation", pathlib=True) + assert dials_data_path + + integration_data_path = Path("tests/thaumatin_i03_rotation.data").resolve() + assert integration_data_path.exists() + + # sigma b: 0.022098 degrees = 0.0003856828581 radians + # sigma m: 0.105343 degrees = 0.0018385821939 radians + command = [ + integrator_path, + "--reflection", + str(integration_data_path / "indexed.refl"), + "--experiment", + str(integration_data_path / "indexed.expt"), + "--images", + dials_data_path / "thau_2_1.nxs", + "--sigma_m", + "0.00183", + "--sigma_b", + "0.00038", + ] + + proc = subprocess.run( + command, + capture_output=True, + cwd=tmp_path, + ) + # assert not proc.stderr, f"Error: {proc.stderr.decode()}" + assert proc.returncode == 0, f"Non-zero return code: {proc.returncode}" + + # Validate output files exist + output_reflections = tmp_path / "output_reflections.h5" + voxel_kabsch_data = tmp_path / "voxel_kabsch_data.h5" + + assert output_reflections.exists(), "output_reflections.h5 not created" + assert voxel_kabsch_data.exists(), "voxel_kabsch_data.h5 not created" + + # Validate reflection data structure + with h5py.File(output_reflections, "r") as f: + # Navigate to the processing group + processing_group = f["dials/processing/group_0"] + + # Check that computed_bbox column was added + assert "computed_bbox" in processing_group, ( + "computed_bbox column not found in output" 
+ ) + + # Check original columns are preserved + assert "s1" in processing_group, "s1 column missing from output" + assert "xyzcal.mm" in processing_group, "xyzcal.mm column missing from output" + assert "bbox" in processing_group, "bbox column missing from output" + + # Validate computed_bbox has correct shape + bbox_data = processing_group["bbox"][:] + computed_bbox_data = processing_group["computed_bbox"][:] + + assert bbox_data.shape == computed_bbox_data.shape, ( + f"Shape mismatch: bbox {bbox_data.shape} vs computed_bbox {computed_bbox_data.shape}" + ) + + # Basic sanity check: bounding boxes should have reasonable values + assert computed_bbox_data.shape[1] == 6, "Computed bbox should have 6 columns" + + # Check that x_min < x_max and y_min < y_max for computed bboxes + x_min, x_max = computed_bbox_data[:, 0], computed_bbox_data[:, 1] + y_min, y_max = computed_bbox_data[:, 2], computed_bbox_data[:, 3] + z_min, z_max = computed_bbox_data[:, 4], computed_bbox_data[:, 5] + + # Debug: Print some computed bbox values + print("\nComputed bbox statistics:") + print(f"Number of reflections: {len(computed_bbox_data)}") + print(f"x_min range: {x_min.min():.6e} to {x_min.max():.6e}") + print(f"x_max range: {x_max.min():.6e} to {x_max.max():.6e}") + print(f"y_min range: {y_min.min():.6e} to {y_min.max():.6e}") + print(f"y_max range: {y_max.min():.6e} to {y_max.max():.6e}") + print(f"z_min range: {z_min.min():.6e} to {z_min.max():.6e}") + print(f"z_max range: {z_max.min():.6e} to {z_max.max():.6e}") + + # Show first few computed bboxes + print("\nFirst 5 computed bboxes:") + for i in range(min(5, len(computed_bbox_data))): + print( + f" [{i}]: x=({x_min[i]:.6e}, {x_max[i]:.6e}), y=({y_min[i]:.6e}, {y_max[i]:.6e}), z=({z_min[i]:.6e}, {z_max[i]:.6e})" + ) + + # Compare with original bbox for context + orig_bbox = processing_group["bbox"][:] + orig_x_min, orig_x_max = orig_bbox[:, 0], orig_bbox[:, 1] + print( + f"\nOriginal bbox x_min range: {orig_x_min.min():.6e} to 
{orig_x_min.max():.6e}" + ) + print( + f"Original bbox x_max range: {orig_x_max.min():.6e} to {orig_x_max.max():.6e}" + ) + + # Count invalid bboxes + invalid_x = ~(x_min < x_max) + invalid_y = ~(y_min < y_max) + invalid_z = ~(z_min < z_max) + print("\nInvalid bbox counts:") + print(f"Invalid x bounds: {invalid_x.sum()} / {len(computed_bbox_data)}") + print(f"Invalid y bounds: {invalid_y.sum()} / {len(computed_bbox_data)}") + print(f"Invalid z bounds: {invalid_z.sum()} / {len(computed_bbox_data)}") + + assert (x_min < x_max).all(), "Invalid x bounds in computed bbox" + assert (y_min < y_max).all(), "Invalid y bounds in computed bbox" + assert (z_min < z_max).all(), "Invalid z bounds in computed bbox" + + # Validate voxel data structure + with h5py.File(voxel_kabsch_data, "r") as f: + # Navigate to the processing group + processing_group = f["dials/processing/group_0"] + + # Check expected columns exist + expected_columns = [ + "kabsch_coordinates", + "reflection_id", + "pixel_coordinates", + "voxel_s1_length", + ] + for col in expected_columns: + assert col in processing_group, f"Column {col} not found in voxel data" + + # Check data consistency + kabsch_coords = processing_group["kabsch_coordinates"][:] + reflection_ids = processing_group["reflection_id"][:] + pixel_coords = processing_group["pixel_coordinates"][:] + s1_lengths = processing_group["voxel_s1_length"][:] + + num_voxels = len(reflection_ids) + assert kabsch_coords.shape == (num_voxels, 3), ( + f"Kabsch coordinates shape mismatch: {kabsch_coords.shape}" + ) + assert pixel_coords.shape == (num_voxels, 3), ( + f"Pixel coordinates shape mismatch: {pixel_coords.shape}" + ) + assert len(s1_lengths) == num_voxels, ( + f"S1 lengths count mismatch: {len(s1_lengths)} vs {num_voxels}" + ) + + # Validate that s1_lengths are positive + assert (s1_lengths > 0).all(), "All s1 lengths should be positive" + + # Validate that reflection IDs are non-negative integers + assert (reflection_ids >= 0).all(), "Reflection IDs 
should be non-negative" + + print( + f"Successfully validated {num_voxels} voxels across {len(set(reflection_ids.flatten()))} reflections" + )