diff --git a/book/src/puzzle_24/warp_sum.md b/book/src/puzzle_24/warp_sum.md index d9ba62b9..17ed4a12 100644 --- a/book/src/puzzle_24/warp_sum.md +++ b/book/src/puzzle_24/warp_sum.md @@ -136,11 +136,13 @@ total = warp_sum(partial_product) ```mojo if lane_id() == 0: - output[0] = total + output[global_i // WARP_SIZE] = total ``` **Why only lane 0?** All lanes have the same `total` value after `warp_sum()`, but we only want to write once to avoid race conditions. +**Why not write to `output[0]`?** Flexibility, function can be used in cases where there is more than one warp. i.e. The result from each warp is written to the unique location `global_i // WARP_SIZE`. + **`lane_id()`:** Returns 0-31 (NVIDIA) or 0-63 (AMD) - identifies which lane within the warp. @@ -280,10 +282,10 @@ else: total = warp_sum(partial_product) if lane_id() == 0: - output.store[1](0, 0, total) + output.store[1](idx // WARP_SIZE, 0, total) ``` -**Storage pattern:** `output.store[1](0, 0, total)` stores 1 element at position (0, 0) in the output tensor. +**Storage pattern:** `output.store[1](idx // WARP_SIZE, 0, total)` stores 1 element at position `(idx // WARP_SIZE, 0)` in the output tensor. **Same warp logic:** `warp_sum()` and lane 0 writing work identically in functional approach. @@ -444,42 +446,38 @@ Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale) Running traditional_65536x Running simple_warp_65536x Running functional_warp_65536x -| name | met (ms) | iters | -| ---------------------- | ------------------ | ----- | -| traditional_1x | 1.0263419180000002 | 1000 | -| simple_warp_1x | 1.025756103 | 1000 | -| functional_warp_1x | 1.027618774 | 1000 | -| traditional_4x | 1.026372558 | 1000 | -| simple_warp_4x | 1.0274108880000001 | 1000 | -| functional_warp_4x | 1.0272440180000002 | 1000 | -| traditional_32x | 1.029869628 | 1000 | -| simple_warp_32x | 1.029203002 | 1000 | -| functional_warp_32x | 1.0293903800000002 | 1000 | -| traditional_256x | 1.055470581 | 1000 | -| simple_warp_256x | 1.0549002680000001 | 1000 | -| functional_warp_256x | 1.054106567 | 1000 | -| traditional_2048x | 1.170297851 | 1000 | -| simple_warp_2048x | 1.1691909169999999 | 1000 | -| functional_warp_2048x | 1.166839843 | 1000 | -| traditional_16384x | 6.470711037837837 | 185 | -| simple_warp_16384x | 6.482257572972973 | 185 | -| functional_warp_16384x | 6.414636946524065 | 187 | -| traditional_65536x | 22.48350437735849 | 53 | -| simple_warp_65536x | 22.561115754716983 | 53 | -| functional_warp_65536x | 22.399149188679246 | 53 | +| name | met (ms) | iters | +| ---------------------- | --------------------- | ----- | +| traditional_1x | 0.00460128 | 100 | +| simple_warp_1x | 0.00574047 | 100 | +| functional_warp_1x | 0.00484192 | 100 | +| traditional_4x | 0.00492671 | 100 | +| simple_warp_4x | 0.00485247 | 100 | +| functional_warp_4x | 0.00587679 | 100 | +| traditional_32x | 0.0062406399999999996 | 100 | +| simple_warp_32x | 0.0054918400000000004 | 100 | +| functional_warp_32x | 0.00552447 | 100 | +| traditional_256x | 0.0050614300000000004 | 100 | +| simple_warp_256x | 0.00488768 | 100 | +| functional_warp_256x | 0.00461472 | 100 | +| traditional_2048x | 0.01120031 | 100 | +| simple_warp_2048x | 0.00884383 | 100 | +| functional_warp_2048x | 0.007038720000000001 | 100 | +| traditional_16384x | 0.038533750000000005 | 100 | +| simple_warp_16384x | 0.0323264 | 100 | +| functional_warp_16384x | 0.01674271 | 100 | +| traditional_65536x | 0.19784991999999998 | 100 | +| simple_warp_65536x | 0.12870176 | 100 | +| functional_warp_65536x | 
0.048680310000000004 | 100 | Benchmarks completed! WARP OPERATIONS PERFORMANCE ANALYSIS: GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD (WARP_SIZE=64) - - 1 x WARP_SIZE: Single warp baseline - - 4 x WARP_SIZE: Few warps, warp overhead visible - - 32 x WARP_SIZE: Medium scale, warp benefits emerge - - 256 x WARP_SIZE: Large scale, dramatic warp advantages - - 2048 x WARP_SIZE: Massive scale, warp operations dominate + - 1,...,256 x WARP_SIZE: Grid size too small to benchmark + - 2048 x WARP_SIZE: Warp primative benefits emerge - 16384 x WARP_SIZE: Large scale (512K-1M elements) - 65536 x WARP_SIZE: Massive scale (2M-4M elements) - - Note: AMD GPUs process 2 x elements per warp vs NVIDIA! Expected Results at Large Scales: • Traditional: Slower due to more barrier overhead diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo index 5f7d3816..455c1ed2 100644 --- a/problems/p24/p24.mojo +++ b/problems/p24/p24.mojo @@ -1,6 +1,6 @@ from math import ceildiv from gpu import thread_idx, block_idx, block_dim, barrier, lane_id -from gpu.host import DeviceContext +from gpu.host import DeviceContext, HostBuffer, DeviceBuffer from gpu.warp import sum as warp_sum, WARP_SIZE from algorithm.functional import elementwise from layout import Layout, LayoutTensor @@ -8,6 +8,7 @@ from layout.tensor_builder import LayoutTensorBuild as tb from utils import IndexList from sys import argv, simd_width_of, size_of, align_of from testing import assert_equal +from random import random_float64 from benchmark import ( Bench, BenchConfig, @@ -23,7 +24,7 @@ from benchmark import ( # ANCHOR: traditional_approach_from_p12 alias SIZE = WARP_SIZE alias BLOCKS_PER_GRID = (1, 1) -alias THREADS_PER_BLOCK = (WARP_SIZE, 1) # optimal choice for warp kernel +alias THREADS_PER_BLOCK = (WARP_SIZE, 1) alias dtype = DType.float32 alias SIMD_WIDTH = simd_width_of[dtype]() alias in_layout = Layout.row_major(SIZE) @@ -51,7 +52,7 @@ fn traditional_dot_product_p12_style[ barrier() - stride = SIZE // 2 + stride = WARP_SIZE // 2 while stride > 0: if local_i < stride: shared[local_i] += shared[local_i + stride] @@ -59,15 +60,13 @@ fn traditional_dot_product_p12_style[ stride //= 2 if local_i == 0: - output[0] = shared[0] + output[global_i // WARP_SIZE] = shared[0] # ANCHOR_END: traditional_approach_from_p12 -# ANCHOR: simple_warp_kernel -from gpu.warp import sum as warp_sum - +# ANCHOR: simple_warp_kernel fn simple_warp_dot_product[ in_layout: Layout, out_layout: Layout, size: Int ]( @@ -84,11 +83,14 @@ fn simple_warp_dot_product[ # ANCHOR: functional_warp_approach fn functional_warp_dot_product[ - layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int + layout: Layout, + out_layout: Layout, + dtype: DType, + simd_width: Int, + rank: Int, + size: Int, ]( - output: LayoutTensor[ - mut=True, dtype, Layout.row_major(1), MutableAnyOrigin - ], + output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin], a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], ctx: DeviceContext, @@ -102,332 +104,350 @@ fn functional_warp_dot_product[ print("idx:", idx) # FILL IN (10 lines at most) - # Launch exactly WARP_SIZE threads (one warp) to process all elements - elementwise[compute_dot_product, 1, target="gpu"](WARP_SIZE, ctx) + # Launch exactly size == WARP_SIZE threads (one warp) to process all elements + elementwise[compute_dot_product, 1, target="gpu"](size, ctx) # ANCHOR_END: functional_warp_approach +fn expected_output[ + dtype: DType, n_warps: Int +]( + 
expected: HostBuffer[dtype], + a: DeviceBuffer[dtype], + b: DeviceBuffer[dtype], +) raises: + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i_warp in range(n_warps): + i_warp_in_buff = WARP_SIZE * i_warp + var warp_sum: Scalar[dtype] = 0 + for i in range(WARP_SIZE): + warp_sum += ( + a_host[i_warp_in_buff + i] * b_host[i_warp_in_buff + i] + ) + expected[i_warp] = warp_sum + + +fn rand_int[ + dtype: DType, size: Int +](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises: + with buff.map_to_host() as buff_host: + for i in range(size): + buff_host[i] = Int(random_float64(min, max)) + + +fn check_result[ + dtype: DType, size: Int, print_result: Bool = False +](actual: DeviceBuffer[dtype], expected: HostBuffer[dtype]) raises: + with actual.map_to_host() as actual_host: + if print_result: + print("=== RESULT ===") + print("actual:", actual_host) + print("expected:", expected) + for i in range(size): + assert_equal(actual_host[i], expected[i]) + + @parameter @always_inline -fn benchmark_simple_warp_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn simple_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_simple_warp_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_threads = WARP_SIZE + alias n_blocks = (ceildiv(test_size, n_threads), 1) + + bench_ctx = DeviceContext() - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - simple_warp_dot_product[test_layout, out_layout, test_size] + simple_warp_dot_product[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, - block_dim=THREADS_PER_BLOCK, + grid_dim=n_blocks, + block_dim=n_threads, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[simple_warp_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline fn 
benchmark_functional_warp_parameterized[ test_size: Int -](mut b: Bencher) raises: - @parameter - @always_inline - fn functional_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + bench_ctx = DeviceContext() - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - a_tensor = LayoutTensor[mut=False, dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, test_layout]( - b_buf.unsafe_ptr() - ) - out_tensor = LayoutTensor[mut=True, dtype, Layout.row_major(1)]( - out.unsafe_ptr() - ) + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + + @parameter + @always_inline + fn functional_warp_workflow(ctx: DeviceContext) raises: functional_warp_dot_product[ - test_layout, dtype, SIMD_WIDTH, 1, test_size + in_layout, out_layout, dtype, SIMD_WIDTH, 1, test_size ](out_tensor, a_tensor, b_tensor, ctx) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[functional_warp_workflow](bench_ctx) + bencher.iter_custom[functional_warp_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline -fn benchmark_traditional_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn traditional_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_traditional_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_blocks = (ceildiv(test_size, WARP_SIZE), 1) + + bench_ctx = DeviceContext() - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, 
test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - traditional_dot_product_p12_style[ - test_layout, out_layout, test_size - ] + traditional_dot_product_p12_style[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, + grid_dim=n_blocks, block_dim=THREADS_PER_BLOCK, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[traditional_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() def main(): - with DeviceContext() as ctx: - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - a_host[i] = i - b_host[i] = i - - out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) - + if argv()[1] != "--benchmark": print("SIZE:", SIZE) print("WARP_SIZE:", WARP_SIZE) print("SIMD_WIDTH:", SIMD_WIDTH) - if argv()[1] == "--traditional": - ctx.enqueue_function[ - traditional_dot_product_p12_style[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - elif argv()[1] == "--kernel": - ctx.enqueue_function[ - simple_warp_dot_product[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - - elif argv()[1] == "--functional": - functional_warp_dot_product[in_layout, dtype, SIMD_WIDTH, 1, SIZE]( - out_tensor, a_tensor, b_tensor, ctx - ) - - elif argv()[1] == "--benchmark": - print("-" * 80) - bench_config = BenchConfig(max_iters=100) - bench = Bench(bench_config.copy()) - - print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") - bench.bench_function[ - benchmark_traditional_parameterized[WARP_SIZE] - ](BenchId("traditional_1x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[WARP_SIZE] - ](BenchId("simple_warp_1x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[WARP_SIZE] - ](BenchId("functional_warp_1x")) - - print("-" * 80) - print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") - bench.bench_function[ - benchmark_traditional_parameterized[4 * WARP_SIZE] - ](BenchId("traditional_4x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[4 * WARP_SIZE] - ](BenchId("simple_warp_4x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[4 * WARP_SIZE] - ](BenchId("functional_warp_4x")) - - print("-" * 80) - print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") - bench.bench_function[ - benchmark_traditional_parameterized[32 * WARP_SIZE] - 
](BenchId("traditional_32x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[32 * WARP_SIZE] - ](BenchId("simple_warp_32x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[32 * WARP_SIZE] - ](BenchId("functional_warp_32x")) - - print("-" * 80) - print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") - bench.bench_function[ - benchmark_traditional_parameterized[256 * WARP_SIZE] - ](BenchId("traditional_256x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[256 * WARP_SIZE] - ](BenchId("simple_warp_256x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[256 * WARP_SIZE] - ](BenchId("functional_warp_256x")) - - print("-" * 80) - print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") - bench.bench_function[ - benchmark_traditional_parameterized[2048 * WARP_SIZE] - ](BenchId("traditional_2048x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("simple_warp_2048x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("functional_warp_2048x")) - - print("-" * 80) - print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") - bench.bench_function[ - benchmark_traditional_parameterized[16384 * WARP_SIZE] - ](BenchId("traditional_16384x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("simple_warp_16384x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("functional_warp_16384x")) - - print("-" * 80) - print( - "Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)" - ) - bench.bench_function[ - benchmark_traditional_parameterized[65536 * WARP_SIZE] - ](BenchId("traditional_65536x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("simple_warp_65536x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("functional_warp_65536x")) - - print(bench) - print("Benchmarks completed!") - print() - print("🚀 WARP OPERATIONS PERFORMANCE ANALYSIS:") - print( - " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD" - " (WARP_SIZE=64)" + alias n_warps = SIZE // WARP_SIZE + with DeviceContext() as ctx: + out = ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + expected = ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) + + out_tensor = LayoutTensor[mut=True, dtype, out_layout]( + out.unsafe_ptr() ) - print(" - 1 x WARP_SIZE: Single warp baseline") - print(" - 4 x WARP_SIZE: Few warps, warp overhead visible") - print(" - 32 x WARP_SIZE: Medium scale, warp benefits emerge") - print(" - 256 x WARP_SIZE: Large scale, dramatic warp advantages") - print( - " - 2048 x WARP_SIZE: Massive scale, warp operations dominate" - ) - print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") - print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") - print( - " - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!" 
- ) - print() - print(" Expected Results at Large Scales:") - print(" • Traditional: Slower due to more barrier overhead") - print( - " • Warp operations: Faster, scale better with problem size" - ) - print(" • Memory bandwidth becomes the limiting factor") - return - - else: - print( - "Usage: --traditional | --kernel | --functional | --benchmark" - ) - return - - expected = ctx.enqueue_create_host_buffer[dtype](1).enqueue_fill(0) - ctx.synchronize() - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - expected[0] += a_host[i] * b_host[i] + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i in range(SIZE): + a_host[i] = i + b_host[i] = i + + if argv()[1] == "--traditional": + ctx.enqueue_function[ + traditional_dot_product_p12_style[ + in_layout, out_layout, SIZE + ] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--kernel": + ctx.enqueue_function[ + simple_warp_dot_product[in_layout, out_layout, SIZE] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--functional": + functional_warp_dot_product[ + in_layout, out_layout, dtype, SIMD_WIDTH, 1, SIZE + ](out_tensor, a_tensor, b_tensor, ctx) + expected_output[dtype, n_warps](expected, a, b) + check_result[dtype, n_warps, True](out, expected) + ctx.synchronize() + elif argv()[1] == "--benchmark": + print("-" * 80) + bench_config = BenchConfig(max_iters=10, num_warmup_iters=1) + bench = Bench(bench_config.copy()) + + print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") + bench.bench_function[benchmark_traditional_parameterized[WARP_SIZE]]( + BenchId("traditional_1x") + ) + bench.bench_function[benchmark_simple_warp_parameterized[WARP_SIZE]]( + BenchId("simple_warp_1x") + ) + bench.bench_function[ + benchmark_functional_warp_parameterized[WARP_SIZE] + ](BenchId("functional_warp_1x")) + + print("-" * 80) + print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") + bench.bench_function[ + benchmark_traditional_parameterized[4 * WARP_SIZE] + ](BenchId("traditional_4x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[4 * WARP_SIZE] + ](BenchId("simple_warp_4x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[4 * WARP_SIZE] + ](BenchId("functional_warp_4x")) + + print("-" * 80) + print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") + bench.bench_function[ + benchmark_traditional_parameterized[32 * WARP_SIZE] + ](BenchId("traditional_32x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[32 * WARP_SIZE] + ](BenchId("simple_warp_32x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[32 * WARP_SIZE] + ](BenchId("functional_warp_32x")) + + print("-" * 80) + print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") + bench.bench_function[ + benchmark_traditional_parameterized[256 * WARP_SIZE] + ](BenchId("traditional_256x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[256 * WARP_SIZE] + ](BenchId("simple_warp_256x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[256 * WARP_SIZE] + ](BenchId("functional_warp_256x")) + + print("-" * 80) + print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") + bench.bench_function[ + benchmark_traditional_parameterized[2048 * WARP_SIZE] + ](BenchId("traditional_2048x")) + bench.bench_function[ + 
benchmark_simple_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("simple_warp_2048x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("functional_warp_2048x")) + + print("-" * 80) + print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[16384 * WARP_SIZE] + ](BenchId("traditional_16384x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("simple_warp_16384x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("functional_warp_16384x")) + + print("-" * 80) + print("Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[65536 * WARP_SIZE] + ](BenchId("traditional_65536x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("simple_warp_65536x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("functional_warp_65536x")) + + print(bench) + print("Benchmarks completed!") + print() + print("WARP OPERATIONS PERFORMANCE ANALYSIS:") + print( + " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD (WARP_SIZE=64)" + ) + print(" - 1,...,256 x WARP_SIZE: Grid size too small to benchmark") + print(" - 2048 x WARP_SIZE: Warp primative benefits emerge") + print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") + print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") + print(" - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!") + print() + print(" Expected Results at Large Scales:") + print(" • Traditional: Slower due to more barrier overhead") + print(" • Warp operations: Faster, scale better with problem size") + print(" • Memory bandwidth becomes the limiting factor") + return - with out.map_to_host() as out_host: - print("=== RESULT ===") - print("out:", out_host[0]) - print("expected:", expected[0]) - assert_equal(out_host[0], expected[0]) - - if len(argv()) == 1 or argv()[1] == "--kernel": - print() - print( - "🚀 Notice how simple the warp version is compared to p10.mojo!" - ) - print( - " Same kernel structure, but warp_sum() replaces all the" - " complexity!" - ) - elif argv()[1] == "--functional": - print() - print( - "🔧 Functional approach shows modern Mojo style with warp" - " operations!" - ) - print( - " Clean, composable, and still leverages warp hardware" - " primitives!" 
- ) + else: + print("Usage: --traditional | --kernel | --functional | --benchmark") + return diff --git a/solutions/p24/p24.mojo b/solutions/p24/p24.mojo index fac2255e..12687639 100644 --- a/solutions/p24/p24.mojo +++ b/solutions/p24/p24.mojo @@ -1,6 +1,6 @@ from math import ceildiv from gpu import thread_idx, block_idx, block_dim, barrier, lane_id -from gpu.host import DeviceContext +from gpu.host import DeviceContext, HostBuffer, DeviceBuffer from gpu.warp import sum as warp_sum, WARP_SIZE from algorithm.functional import elementwise from layout import Layout, LayoutTensor @@ -8,6 +8,7 @@ from layout.tensor_builder import LayoutTensorBuild as tb from utils import IndexList from sys import argv, simd_width_of, size_of, align_of from testing import assert_equal +from random import random_float64 from benchmark import ( Bench, BenchConfig, @@ -29,8 +30,8 @@ alias in_layout = Layout.row_major(SIZE) alias out_layout = Layout.row_major(1) -# ANCHOR: traditional_approach_from_p10 -fn traditional_dot_product_p10_style[ +# ANCHOR: traditional_approach_from_p12 +fn traditional_dot_product_p12_style[ in_layout: Layout, out_layout: Layout, size: Int ]( output: LayoutTensor[mut=True, dtype, out_layout], @@ -38,7 +39,7 @@ fn traditional_dot_product_p10_style[ b: LayoutTensor[mut=False, dtype, in_layout], ): """ - This is the complex approach from p10_layout_tensor.mojo - kept for comparison. + This is the complex approach from p12_layout_tensor.mojo - kept for comparison. """ shared = tb[dtype]().row_major[WARP_SIZE]().shared().alloc() global_i = block_dim.x * block_idx.x + thread_idx.x @@ -51,7 +52,7 @@ fn traditional_dot_product_p10_style[ barrier() - stride = SIZE // 2 + stride = WARP_SIZE // 2 while stride > 0: if local_i < stride: shared[local_i] += shared[local_i + stride] @@ -59,7 +60,7 @@ fn traditional_dot_product_p10_style[ stride //= 2 if local_i == 0: - output[0] = shared[0] + output[global_i // WARP_SIZE] = shared[0] # ANCHOR_END: traditional_approach_from_p10 @@ -85,7 +86,7 @@ fn simple_warp_dot_product[ # Only lane 0 writes the result (all lanes have the same total) if lane_id() == 0: - output[0] = total + output[global_i // WARP_SIZE] = total # ANCHOR_END: simple_warp_kernel_solution @@ -93,11 +94,14 @@ fn simple_warp_dot_product[ # ANCHOR: functional_warp_approach_solution fn functional_warp_dot_product[ - layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int + layout: Layout, + out_layout: Layout, + dtype: DType, + simd_width: Int, + rank: Int, + size: Int, ]( - output: LayoutTensor[ - mut=True, dtype, Layout.row_major(1), MutableAnyOrigin - ], + output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin], a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], ctx: DeviceContext, @@ -114,7 +118,7 @@ fn functional_warp_dot_product[ if idx < size: a_val = a.load[1](idx, 0) b_val = b.load[1](idx, 0) - partial_product = (a_val * b_val).reduce_add() + partial_product = a_val * b_val else: partial_product = 0.0 @@ -123,314 +127,352 @@ fn functional_warp_dot_product[ # Only lane 0 writes the result (all lanes have the same total) if lane_id() == 0: - output.store[1](0, 0, total) + output.store[1](idx // WARP_SIZE, 0, total) - # Launch exactly WARP_SIZE threads (one warp) to process all elements - elementwise[compute_dot_product, 1, target="gpu"](WARP_SIZE, ctx) + # Launch exactly size == WARP_SIZE threads (one warp) to process all elements + elementwise[compute_dot_product, 1, target="gpu"](size, ctx) 
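+    # When `size` spans multiple warps (as in the larger benchmark sizes), this
+    # launch covers several warps: each warp reduces its own WARP_SIZE elements
+    # and its lane 0 writes the partial sum to output slot idx // WARP_SIZE,
+    # which is exactly what `expected_output` below computes per warp.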
# ANCHOR_END: functional_warp_approach_solution +fn expected_output[ + dtype: DType, n_warps: Int +]( + expected: HostBuffer[dtype], + a: DeviceBuffer[dtype], + b: DeviceBuffer[dtype], +) raises: + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i_warp in range(n_warps): + i_warp_in_buff = WARP_SIZE * i_warp + var warp_sum: Scalar[dtype] = 0 + for i in range(WARP_SIZE): + warp_sum += ( + a_host[i_warp_in_buff + i] * b_host[i_warp_in_buff + i] + ) + expected[i_warp] = warp_sum + + +fn rand_int[ + dtype: DType, size: Int +](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises: + with buff.map_to_host() as buff_host: + for i in range(size): + buff_host[i] = Int(random_float64(min, max)) + + +fn check_result[ + dtype: DType, size: Int, print_result: Bool = False +](actual: DeviceBuffer[dtype], expected: HostBuffer[dtype]) raises: + with actual.map_to_host() as actual_host: + if print_result: + print("=== RESULT ===") + print("actual:", actual_host) + print("expected:", expected) + for i in range(size): + assert_equal(actual_host[i], expected[i]) + + @parameter @always_inline -fn benchmark_simple_warp_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn simple_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_simple_warp_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_threads = WARP_SIZE + alias n_blocks = (ceildiv(test_size, n_threads), 1) - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + bench_ctx = DeviceContext() + + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - simple_warp_dot_product[test_layout, out_layout, test_size] + simple_warp_dot_product[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, - block_dim=THREADS_PER_BLOCK, + grid_dim=n_blocks, + block_dim=n_threads, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[simple_warp_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + 
keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline fn benchmark_functional_warp_parameterized[ test_size: Int -](mut b: Bencher) raises: - @parameter - @always_inline - fn functional_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + bench_ctx = DeviceContext() - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - a_tensor = LayoutTensor[mut=False, dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, test_layout]( - b_buf.unsafe_ptr() - ) - output_tensor = LayoutTensor[mut=True, dtype, Layout.row_major(1)]( - out.unsafe_ptr() - ) + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) + + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn functional_warp_workflow(ctx: DeviceContext) raises: functional_warp_dot_product[ - test_layout, dtype, SIMD_WIDTH, 1, test_size - ](output_tensor, a_tensor, b_tensor, ctx) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() + in_layout, out_layout, dtype, SIMD_WIDTH, 1, test_size + ](out_tensor, a_tensor, b_tensor, ctx) - bench_ctx = DeviceContext() - b.iter_custom[functional_warp_workflow](bench_ctx) + bencher.iter_custom[functional_warp_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline -fn benchmark_traditional_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn traditional_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_traditional_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_blocks = (ceildiv(test_size, WARP_SIZE), 1) + + bench_ctx = DeviceContext() - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as 
a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - traditional_dot_product_p10_style[ - test_layout, out_layout, test_size - ] + traditional_dot_product_p12_style[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, + grid_dim=n_blocks, block_dim=THREADS_PER_BLOCK, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[traditional_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() def main(): - with DeviceContext() as ctx: - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - a_host[i] = i - b_host[i] = i - - out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) - + if argv()[1] != "--benchmark": print("SIZE:", SIZE) print("WARP_SIZE:", WARP_SIZE) print("SIMD_WIDTH:", SIMD_WIDTH) - if argv()[1] == "--traditional": - ctx.enqueue_function[ - traditional_dot_product_p10_style[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - elif argv()[1] == "--kernel": - ctx.enqueue_function[ - simple_warp_dot_product[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - - elif argv()[1] == "--functional": - functional_warp_dot_product[in_layout, dtype, SIMD_WIDTH, 1, SIZE]( - out_tensor, a_tensor, b_tensor, ctx - ) - - elif argv()[1] == "--benchmark": - print("-" * 80) - bench_config = BenchConfig(max_iters=100) - bench = Bench(bench_config.copy()) - - print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") - bench.bench_function[ - benchmark_traditional_parameterized[WARP_SIZE] - ](BenchId("traditional_1x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[WARP_SIZE] - ](BenchId("simple_warp_1x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[WARP_SIZE] - ](BenchId("functional_warp_1x")) - - print("-" * 80) - print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") - bench.bench_function[ - benchmark_traditional_parameterized[4 * WARP_SIZE] - ](BenchId("traditional_4x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[4 * WARP_SIZE] - ](BenchId("simple_warp_4x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[4 * WARP_SIZE] - ](BenchId("functional_warp_4x")) - - 
print("-" * 80) - print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") - bench.bench_function[ - benchmark_traditional_parameterized[32 * WARP_SIZE] - ](BenchId("traditional_32x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[32 * WARP_SIZE] - ](BenchId("simple_warp_32x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[32 * WARP_SIZE] - ](BenchId("functional_warp_32x")) - - print("-" * 80) - print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") - bench.bench_function[ - benchmark_traditional_parameterized[256 * WARP_SIZE] - ](BenchId("traditional_256x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[256 * WARP_SIZE] - ](BenchId("simple_warp_256x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[256 * WARP_SIZE] - ](BenchId("functional_warp_256x")) - - print("-" * 80) - print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") - bench.bench_function[ - benchmark_traditional_parameterized[2048 * WARP_SIZE] - ](BenchId("traditional_2048x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("simple_warp_2048x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("functional_warp_2048x")) - - print("-" * 80) - print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") - bench.bench_function[ - benchmark_traditional_parameterized[16384 * WARP_SIZE] - ](BenchId("traditional_16384x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("simple_warp_16384x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("functional_warp_16384x")) - - print("-" * 80) - print( - "Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)" + alias n_warps = SIZE // WARP_SIZE + with DeviceContext() as ctx: + out = ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + expected = ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) + + out_tensor = LayoutTensor[mut=True, dtype, out_layout]( + out.unsafe_ptr() ) - bench.bench_function[ - benchmark_traditional_parameterized[65536 * WARP_SIZE] - ](BenchId("traditional_65536x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("simple_warp_65536x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("functional_warp_65536x")) - - print(bench) - print("Benchmarks completed!") - print() - print("WARP OPERATIONS PERFORMANCE ANALYSIS:") - print( - " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD" - " (WARP_SIZE=64)" - ) - print(" - 1 x WARP_SIZE: Single warp baseline") - print(" - 4 x WARP_SIZE: Few warps, warp overhead visible") - print(" - 32 x WARP_SIZE: Medium scale, warp benefits emerge") - print(" - 256 x WARP_SIZE: Large scale, dramatic warp advantages") - print( - " - 2048 x WARP_SIZE: Massive scale, warp operations dominate" - ) - print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") - print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") - print( - " - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!" 
- ) - print() - print(" Expected Results at Large Scales:") - print(" • Traditional: Slower due to more barrier overhead") - print( - " • Warp operations: Faster, scale better with problem size" - ) - print(" • Memory bandwidth becomes the limiting factor") - return - - else: - print( - "Usage: --traditional | --kernel | --functional | --benchmark" - ) - return - - expected = ctx.enqueue_create_host_buffer[dtype](1).enqueue_fill(0) - ctx.synchronize() - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - expected[0] += a_host[i] * b_host[i] + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i in range(SIZE): + a_host[i] = i + b_host[i] = i + + if argv()[1] == "--traditional": + ctx.enqueue_function[ + traditional_dot_product_p12_style[ + in_layout, out_layout, SIZE + ] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--kernel": + ctx.enqueue_function[ + simple_warp_dot_product[in_layout, out_layout, SIZE] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--functional": + functional_warp_dot_product[ + in_layout, out_layout, dtype, SIMD_WIDTH, 1, SIZE + ](out_tensor, a_tensor, b_tensor, ctx) + expected_output[dtype, n_warps](expected, a, b) + check_result[dtype, n_warps, True](out, expected) + ctx.synchronize() + elif argv()[1] == "--benchmark": + print("-" * 80) + bench_config = BenchConfig(max_iters=100, num_warmup_iters=1) + bench = Bench(bench_config.copy()) + + print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") + bench.bench_function[benchmark_traditional_parameterized[WARP_SIZE]]( + BenchId("traditional_1x") + ) + bench.bench_function[benchmark_simple_warp_parameterized[WARP_SIZE]]( + BenchId("simple_warp_1x") + ) + bench.bench_function[ + benchmark_functional_warp_parameterized[WARP_SIZE] + ](BenchId("functional_warp_1x")) + + print("-" * 80) + print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") + bench.bench_function[ + benchmark_traditional_parameterized[4 * WARP_SIZE] + ](BenchId("traditional_4x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[4 * WARP_SIZE] + ](BenchId("simple_warp_4x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[4 * WARP_SIZE] + ](BenchId("functional_warp_4x")) + + print("-" * 80) + print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") + bench.bench_function[ + benchmark_traditional_parameterized[32 * WARP_SIZE] + ](BenchId("traditional_32x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[32 * WARP_SIZE] + ](BenchId("simple_warp_32x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[32 * WARP_SIZE] + ](BenchId("functional_warp_32x")) + + print("-" * 80) + print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") + bench.bench_function[ + benchmark_traditional_parameterized[256 * WARP_SIZE] + ](BenchId("traditional_256x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[256 * WARP_SIZE] + ](BenchId("simple_warp_256x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[256 * WARP_SIZE] + ](BenchId("functional_warp_256x")) + + print("-" * 80) + print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") + bench.bench_function[ + benchmark_traditional_parameterized[2048 * WARP_SIZE] + ](BenchId("traditional_2048x")) + bench.bench_function[ + 
benchmark_simple_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("simple_warp_2048x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("functional_warp_2048x")) + + print("-" * 80) + print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[16384 * WARP_SIZE] + ](BenchId("traditional_16384x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("simple_warp_16384x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("functional_warp_16384x")) + + print("-" * 80) + print("Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[65536 * WARP_SIZE] + ](BenchId("traditional_65536x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("simple_warp_65536x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("functional_warp_65536x")) + + print(bench) + print("Benchmarks completed!") + print() + print("WARP OPERATIONS PERFORMANCE ANALYSIS:") + print( + " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD (WARP_SIZE=64)" + ) + print(" - 1,...,256 x WARP_SIZE: Grid size too small to benchmark") + print(" - 2048 x WARP_SIZE: Warp primative benefits emerge") + print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") + print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") + print(" - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!") + print() + print(" Expected Results at Large Scales:") + print(" • Traditional: Slower due to more barrier overhead") + print(" • Warp operations: Faster, scale better with problem size") + print(" • Memory bandwidth becomes the limiting factor") + return - with out.map_to_host() as out_host: - print("=== RESULT ===") - print("out:", out_host[0]) - print("expected:", expected[0]) - assert_equal(out_host[0], expected[0]) + else: + print("Usage: --traditional | --kernel | --functional | --benchmark") + return
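With this patch the kernels emit one partial sum per warp instead of a single scalar, so a caller that still wants the full dot product has to reduce `output` over the `n_warps` slots on the host. A minimal sketch of that reduction, assuming the `DeviceBuffer`/`HostBuffer` setup from `main()` above; the helper name `final_dot_product` is not part of the patch:

```mojo
from gpu.host import DeviceBuffer


fn final_dot_product[
    dtype: DType, n_warps: Int
](out: DeviceBuffer[dtype]) raises -> Scalar[dtype]:
    # Sum the per-warp partial results that lane 0 of each warp wrote.
    var total: Scalar[dtype] = 0
    with out.map_to_host() as out_host:
        for i_warp in range(n_warps):
            total += out_host[i_warp]
    return total
```

With the buffers from `main()`, `final_dot_product[dtype, n_warps](out)` recovers the single scalar that the pre-patch code stored in `output[0]`.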