diff --git a/book/src/puzzle_24/warp_sum.md b/book/src/puzzle_24/warp_sum.md index d9ba62b9..17ed4a12 100644 --- a/book/src/puzzle_24/warp_sum.md +++ b/book/src/puzzle_24/warp_sum.md @@ -136,11 +136,13 @@ total = warp_sum(partial_product) ```mojo if lane_id() == 0: - output[0] = total + output[global_i // WARP_SIZE] = total ``` **Why only lane 0?** All lanes have the same `total` value after `warp_sum()`, but we only want to write once to avoid race conditions. +**Why not write to `output[0]`?** Flexibility, function can be used in cases where there is more than one warp. i.e. The result from each warp is written to the unique location `global_i // WARP_SIZE`. + **`lane_id()`:** Returns 0-31 (NVIDIA) or 0-63 (AMD) - identifies which lane within the warp. @@ -280,10 +282,10 @@ else: total = warp_sum(partial_product) if lane_id() == 0: - output.store[1](0, 0, total) + output.store[1](idx // WARP_SIZE, 0, total) ``` -**Storage pattern:** `output.store[1](0, 0, total)` stores 1 element at position (0, 0) in the output tensor. +**Storage pattern:** `output.store[1](idx // WARP_SIZE, 0, total)` stores 1 element at position `(idx // WARP_SIZE, 0)` in the output tensor. **Same warp logic:** `warp_sum()` and lane 0 writing work identically in functional approach. @@ -444,42 +446,38 @@ Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale) Running traditional_65536x Running simple_warp_65536x Running functional_warp_65536x -| name | met (ms) | iters | -| ---------------------- | ------------------ | ----- | -| traditional_1x | 1.0263419180000002 | 1000 | -| simple_warp_1x | 1.025756103 | 1000 | -| functional_warp_1x | 1.027618774 | 1000 | -| traditional_4x | 1.026372558 | 1000 | -| simple_warp_4x | 1.0274108880000001 | 1000 | -| functional_warp_4x | 1.0272440180000002 | 1000 | -| traditional_32x | 1.029869628 | 1000 | -| simple_warp_32x | 1.029203002 | 1000 | -| functional_warp_32x | 1.0293903800000002 | 1000 | -| traditional_256x | 1.055470581 | 1000 | -| simple_warp_256x | 1.0549002680000001 | 1000 | -| functional_warp_256x | 1.054106567 | 1000 | -| traditional_2048x | 1.170297851 | 1000 | -| simple_warp_2048x | 1.1691909169999999 | 1000 | -| functional_warp_2048x | 1.166839843 | 1000 | -| traditional_16384x | 6.470711037837837 | 185 | -| simple_warp_16384x | 6.482257572972973 | 185 | -| functional_warp_16384x | 6.414636946524065 | 187 | -| traditional_65536x | 22.48350437735849 | 53 | -| simple_warp_65536x | 22.561115754716983 | 53 | -| functional_warp_65536x | 22.399149188679246 | 53 | +| name | met (ms) | iters | +| ---------------------- | --------------------- | ----- | +| traditional_1x | 0.00460128 | 100 | +| simple_warp_1x | 0.00574047 | 100 | +| functional_warp_1x | 0.00484192 | 100 | +| traditional_4x | 0.00492671 | 100 | +| simple_warp_4x | 0.00485247 | 100 | +| functional_warp_4x | 0.00587679 | 100 | +| traditional_32x | 0.0062406399999999996 | 100 | +| simple_warp_32x | 0.0054918400000000004 | 100 | +| functional_warp_32x | 0.00552447 | 100 | +| traditional_256x | 0.0050614300000000004 | 100 | +| simple_warp_256x | 0.00488768 | 100 | +| functional_warp_256x | 0.00461472 | 100 | +| traditional_2048x | 0.01120031 | 100 | +| simple_warp_2048x | 0.00884383 | 100 | +| functional_warp_2048x | 0.007038720000000001 | 100 | +| traditional_16384x | 0.038533750000000005 | 100 | +| simple_warp_16384x | 0.0323264 | 100 | +| functional_warp_16384x | 0.01674271 | 100 | +| traditional_65536x | 0.19784991999999998 | 100 | +| simple_warp_65536x | 0.12870176 | 100 | +| functional_warp_65536x | 
0.048680310000000004 | 100 | Benchmarks completed! WARP OPERATIONS PERFORMANCE ANALYSIS: GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD (WARP_SIZE=64) - - 1 x WARP_SIZE: Single warp baseline - - 4 x WARP_SIZE: Few warps, warp overhead visible - - 32 x WARP_SIZE: Medium scale, warp benefits emerge - - 256 x WARP_SIZE: Large scale, dramatic warp advantages - - 2048 x WARP_SIZE: Massive scale, warp operations dominate + - 1,...,256 x WARP_SIZE: Grid size too small to benchmark + - 2048 x WARP_SIZE: Warp primative benefits emerge - 16384 x WARP_SIZE: Large scale (512K-1M elements) - 65536 x WARP_SIZE: Massive scale (2M-4M elements) - - Note: AMD GPUs process 2 x elements per warp vs NVIDIA! Expected Results at Large Scales: • Traditional: Slower due to more barrier overhead diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo index 5f7d3816..455c1ed2 100644 --- a/problems/p24/p24.mojo +++ b/problems/p24/p24.mojo @@ -1,6 +1,6 @@ from math import ceildiv from gpu import thread_idx, block_idx, block_dim, barrier, lane_id -from gpu.host import DeviceContext +from gpu.host import DeviceContext, HostBuffer, DeviceBuffer from gpu.warp import sum as warp_sum, WARP_SIZE from algorithm.functional import elementwise from layout import Layout, LayoutTensor @@ -8,6 +8,7 @@ from layout.tensor_builder import LayoutTensorBuild as tb from utils import IndexList from sys import argv, simd_width_of, size_of, align_of from testing import assert_equal +from random import random_float64 from benchmark import ( Bench, BenchConfig, @@ -23,7 +24,7 @@ from benchmark import ( # ANCHOR: traditional_approach_from_p12 alias SIZE = WARP_SIZE alias BLOCKS_PER_GRID = (1, 1) -alias THREADS_PER_BLOCK = (WARP_SIZE, 1) # optimal choice for warp kernel +alias THREADS_PER_BLOCK = (WARP_SIZE, 1) alias dtype = DType.float32 alias SIMD_WIDTH = simd_width_of[dtype]() alias in_layout = Layout.row_major(SIZE) @@ -51,7 +52,7 @@ fn traditional_dot_product_p12_style[ barrier() - stride = SIZE // 2 + stride = WARP_SIZE // 2 while stride > 0: if local_i < stride: shared[local_i] += shared[local_i + stride] @@ -59,15 +60,13 @@ fn traditional_dot_product_p12_style[ stride //= 2 if local_i == 0: - output[0] = shared[0] + output[global_i // WARP_SIZE] = shared[0] # ANCHOR_END: traditional_approach_from_p12 -# ANCHOR: simple_warp_kernel -from gpu.warp import sum as warp_sum - +# ANCHOR: simple_warp_kernel fn simple_warp_dot_product[ in_layout: Layout, out_layout: Layout, size: Int ]( @@ -84,11 +83,14 @@ fn simple_warp_dot_product[ # ANCHOR: functional_warp_approach fn functional_warp_dot_product[ - layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int + layout: Layout, + out_layout: Layout, + dtype: DType, + simd_width: Int, + rank: Int, + size: Int, ]( - output: LayoutTensor[ - mut=True, dtype, Layout.row_major(1), MutableAnyOrigin - ], + output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin], a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], ctx: DeviceContext, @@ -102,332 +104,350 @@ fn functional_warp_dot_product[ print("idx:", idx) # FILL IN (10 lines at most) - # Launch exactly WARP_SIZE threads (one warp) to process all elements - elementwise[compute_dot_product, 1, target="gpu"](WARP_SIZE, ctx) + # Launch exactly size == WARP_SIZE threads (one warp) to process all elements + elementwise[compute_dot_product, 1, target="gpu"](size, ctx) # ANCHOR_END: functional_warp_approach +fn expected_output[ + dtype: DType, n_warps: Int +]( + 
expected: HostBuffer[dtype], + a: DeviceBuffer[dtype], + b: DeviceBuffer[dtype], +) raises: + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i_warp in range(n_warps): + i_warp_in_buff = WARP_SIZE * i_warp + var warp_sum: Scalar[dtype] = 0 + for i in range(WARP_SIZE): + warp_sum += ( + a_host[i_warp_in_buff + i] * b_host[i_warp_in_buff + i] + ) + expected[i_warp] = warp_sum + + +fn rand_int[ + dtype: DType, size: Int +](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises: + with buff.map_to_host() as buff_host: + for i in range(size): + buff_host[i] = Int(random_float64(min, max)) + + +fn check_result[ + dtype: DType, size: Int, print_result: Bool = False +](actual: DeviceBuffer[dtype], expected: HostBuffer[dtype]) raises: + with actual.map_to_host() as actual_host: + if print_result: + print("=== RESULT ===") + print("actual:", actual_host) + print("expected:", expected) + for i in range(size): + assert_equal(actual_host[i], expected[i]) + + @parameter @always_inline -fn benchmark_simple_warp_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn simple_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_simple_warp_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_threads = WARP_SIZE + alias n_blocks = (ceildiv(test_size, n_threads), 1) + + bench_ctx = DeviceContext() - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - simple_warp_dot_product[test_layout, out_layout, test_size] + simple_warp_dot_product[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, - block_dim=THREADS_PER_BLOCK, + grid_dim=n_blocks, + block_dim=n_threads, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[simple_warp_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline fn 
benchmark_functional_warp_parameterized[ test_size: Int -](mut b: Bencher) raises: - @parameter - @always_inline - fn functional_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + bench_ctx = DeviceContext() - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - a_tensor = LayoutTensor[mut=False, dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, test_layout]( - b_buf.unsafe_ptr() - ) - out_tensor = LayoutTensor[mut=True, dtype, Layout.row_major(1)]( - out.unsafe_ptr() - ) + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + + @parameter + @always_inline + fn functional_warp_workflow(ctx: DeviceContext) raises: functional_warp_dot_product[ - test_layout, dtype, SIMD_WIDTH, 1, test_size + in_layout, out_layout, dtype, SIMD_WIDTH, 1, test_size ](out_tensor, a_tensor, b_tensor, ctx) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[functional_warp_workflow](bench_ctx) + bencher.iter_custom[functional_warp_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline -fn benchmark_traditional_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn traditional_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_traditional_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_blocks = (ceildiv(test_size, WARP_SIZE), 1) + + bench_ctx = DeviceContext() - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, 
test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - traditional_dot_product_p12_style[ - test_layout, out_layout, test_size - ] + traditional_dot_product_p12_style[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, + grid_dim=n_blocks, block_dim=THREADS_PER_BLOCK, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[traditional_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() def main(): - with DeviceContext() as ctx: - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - a_host[i] = i - b_host[i] = i - - out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) - + if argv()[1] != "--benchmark": print("SIZE:", SIZE) print("WARP_SIZE:", WARP_SIZE) print("SIMD_WIDTH:", SIMD_WIDTH) - if argv()[1] == "--traditional": - ctx.enqueue_function[ - traditional_dot_product_p12_style[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - elif argv()[1] == "--kernel": - ctx.enqueue_function[ - simple_warp_dot_product[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - - elif argv()[1] == "--functional": - functional_warp_dot_product[in_layout, dtype, SIMD_WIDTH, 1, SIZE]( - out_tensor, a_tensor, b_tensor, ctx - ) - - elif argv()[1] == "--benchmark": - print("-" * 80) - bench_config = BenchConfig(max_iters=100) - bench = Bench(bench_config.copy()) - - print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") - bench.bench_function[ - benchmark_traditional_parameterized[WARP_SIZE] - ](BenchId("traditional_1x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[WARP_SIZE] - ](BenchId("simple_warp_1x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[WARP_SIZE] - ](BenchId("functional_warp_1x")) - - print("-" * 80) - print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") - bench.bench_function[ - benchmark_traditional_parameterized[4 * WARP_SIZE] - ](BenchId("traditional_4x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[4 * WARP_SIZE] - ](BenchId("simple_warp_4x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[4 * WARP_SIZE] - ](BenchId("functional_warp_4x")) - - print("-" * 80) - print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") - bench.bench_function[ - benchmark_traditional_parameterized[32 * WARP_SIZE] - 
](BenchId("traditional_32x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[32 * WARP_SIZE] - ](BenchId("simple_warp_32x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[32 * WARP_SIZE] - ](BenchId("functional_warp_32x")) - - print("-" * 80) - print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") - bench.bench_function[ - benchmark_traditional_parameterized[256 * WARP_SIZE] - ](BenchId("traditional_256x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[256 * WARP_SIZE] - ](BenchId("simple_warp_256x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[256 * WARP_SIZE] - ](BenchId("functional_warp_256x")) - - print("-" * 80) - print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") - bench.bench_function[ - benchmark_traditional_parameterized[2048 * WARP_SIZE] - ](BenchId("traditional_2048x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("simple_warp_2048x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("functional_warp_2048x")) - - print("-" * 80) - print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") - bench.bench_function[ - benchmark_traditional_parameterized[16384 * WARP_SIZE] - ](BenchId("traditional_16384x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("simple_warp_16384x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("functional_warp_16384x")) - - print("-" * 80) - print( - "Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)" - ) - bench.bench_function[ - benchmark_traditional_parameterized[65536 * WARP_SIZE] - ](BenchId("traditional_65536x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("simple_warp_65536x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("functional_warp_65536x")) - - print(bench) - print("Benchmarks completed!") - print() - print("🚀 WARP OPERATIONS PERFORMANCE ANALYSIS:") - print( - " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD" - " (WARP_SIZE=64)" + alias n_warps = SIZE // WARP_SIZE + with DeviceContext() as ctx: + out = ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + expected = ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) + + out_tensor = LayoutTensor[mut=True, dtype, out_layout]( + out.unsafe_ptr() ) - print(" - 1 x WARP_SIZE: Single warp baseline") - print(" - 4 x WARP_SIZE: Few warps, warp overhead visible") - print(" - 32 x WARP_SIZE: Medium scale, warp benefits emerge") - print(" - 256 x WARP_SIZE: Large scale, dramatic warp advantages") - print( - " - 2048 x WARP_SIZE: Massive scale, warp operations dominate" - ) - print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") - print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") - print( - " - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!" 
- ) - print() - print(" Expected Results at Large Scales:") - print(" • Traditional: Slower due to more barrier overhead") - print( - " • Warp operations: Faster, scale better with problem size" - ) - print(" • Memory bandwidth becomes the limiting factor") - return - - else: - print( - "Usage: --traditional | --kernel | --functional | --benchmark" - ) - return - - expected = ctx.enqueue_create_host_buffer[dtype](1).enqueue_fill(0) - ctx.synchronize() - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - expected[0] += a_host[i] * b_host[i] + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i in range(SIZE): + a_host[i] = i + b_host[i] = i + + if argv()[1] == "--traditional": + ctx.enqueue_function[ + traditional_dot_product_p12_style[ + in_layout, out_layout, SIZE + ] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--kernel": + ctx.enqueue_function[ + simple_warp_dot_product[in_layout, out_layout, SIZE] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--functional": + functional_warp_dot_product[ + in_layout, out_layout, dtype, SIMD_WIDTH, 1, SIZE + ](out_tensor, a_tensor, b_tensor, ctx) + expected_output[dtype, n_warps](expected, a, b) + check_result[dtype, n_warps, True](out, expected) + ctx.synchronize() + elif argv()[1] == "--benchmark": + print("-" * 80) + bench_config = BenchConfig(max_iters=10, num_warmup_iters=1) + bench = Bench(bench_config.copy()) + + print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") + bench.bench_function[benchmark_traditional_parameterized[WARP_SIZE]]( + BenchId("traditional_1x") + ) + bench.bench_function[benchmark_simple_warp_parameterized[WARP_SIZE]]( + BenchId("simple_warp_1x") + ) + bench.bench_function[ + benchmark_functional_warp_parameterized[WARP_SIZE] + ](BenchId("functional_warp_1x")) + + print("-" * 80) + print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") + bench.bench_function[ + benchmark_traditional_parameterized[4 * WARP_SIZE] + ](BenchId("traditional_4x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[4 * WARP_SIZE] + ](BenchId("simple_warp_4x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[4 * WARP_SIZE] + ](BenchId("functional_warp_4x")) + + print("-" * 80) + print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") + bench.bench_function[ + benchmark_traditional_parameterized[32 * WARP_SIZE] + ](BenchId("traditional_32x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[32 * WARP_SIZE] + ](BenchId("simple_warp_32x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[32 * WARP_SIZE] + ](BenchId("functional_warp_32x")) + + print("-" * 80) + print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") + bench.bench_function[ + benchmark_traditional_parameterized[256 * WARP_SIZE] + ](BenchId("traditional_256x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[256 * WARP_SIZE] + ](BenchId("simple_warp_256x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[256 * WARP_SIZE] + ](BenchId("functional_warp_256x")) + + print("-" * 80) + print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") + bench.bench_function[ + benchmark_traditional_parameterized[2048 * WARP_SIZE] + ](BenchId("traditional_2048x")) + bench.bench_function[ + 
benchmark_simple_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("simple_warp_2048x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("functional_warp_2048x")) + + print("-" * 80) + print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[16384 * WARP_SIZE] + ](BenchId("traditional_16384x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("simple_warp_16384x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("functional_warp_16384x")) + + print("-" * 80) + print("Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[65536 * WARP_SIZE] + ](BenchId("traditional_65536x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("simple_warp_65536x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("functional_warp_65536x")) + + print(bench) + print("Benchmarks completed!") + print() + print("WARP OPERATIONS PERFORMANCE ANALYSIS:") + print( + " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD (WARP_SIZE=64)" + ) + print(" - 1,...,256 x WARP_SIZE: Grid size too small to benchmark") + print(" - 2048 x WARP_SIZE: Warp primative benefits emerge") + print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") + print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") + print(" - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!") + print() + print(" Expected Results at Large Scales:") + print(" • Traditional: Slower due to more barrier overhead") + print(" • Warp operations: Faster, scale better with problem size") + print(" • Memory bandwidth becomes the limiting factor") + return - with out.map_to_host() as out_host: - print("=== RESULT ===") - print("out:", out_host[0]) - print("expected:", expected[0]) - assert_equal(out_host[0], expected[0]) - - if len(argv()) == 1 or argv()[1] == "--kernel": - print() - print( - "🚀 Notice how simple the warp version is compared to p10.mojo!" - ) - print( - " Same kernel structure, but warp_sum() replaces all the" - " complexity!" - ) - elif argv()[1] == "--functional": - print() - print( - "🔧 Functional approach shows modern Mojo style with warp" - " operations!" - ) - print( - " Clean, composable, and still leverages warp hardware" - " primitives!" 
- ) + else: + print("Usage: --traditional | --kernel | --functional | --benchmark") + return diff --git a/solutions/p24/p24.mojo b/solutions/p24/p24.mojo index fac2255e..12687639 100644 --- a/solutions/p24/p24.mojo +++ b/solutions/p24/p24.mojo @@ -1,6 +1,6 @@ from math import ceildiv from gpu import thread_idx, block_idx, block_dim, barrier, lane_id -from gpu.host import DeviceContext +from gpu.host import DeviceContext, HostBuffer, DeviceBuffer from gpu.warp import sum as warp_sum, WARP_SIZE from algorithm.functional import elementwise from layout import Layout, LayoutTensor @@ -8,6 +8,7 @@ from layout.tensor_builder import LayoutTensorBuild as tb from utils import IndexList from sys import argv, simd_width_of, size_of, align_of from testing import assert_equal +from random import random_float64 from benchmark import ( Bench, BenchConfig, @@ -29,8 +30,8 @@ alias in_layout = Layout.row_major(SIZE) alias out_layout = Layout.row_major(1) -# ANCHOR: traditional_approach_from_p10 -fn traditional_dot_product_p10_style[ +# ANCHOR: traditional_approach_from_p12 +fn traditional_dot_product_p12_style[ in_layout: Layout, out_layout: Layout, size: Int ]( output: LayoutTensor[mut=True, dtype, out_layout], @@ -38,7 +39,7 @@ fn traditional_dot_product_p10_style[ b: LayoutTensor[mut=False, dtype, in_layout], ): """ - This is the complex approach from p10_layout_tensor.mojo - kept for comparison. + This is the complex approach from p12_layout_tensor.mojo - kept for comparison. """ shared = tb[dtype]().row_major[WARP_SIZE]().shared().alloc() global_i = block_dim.x * block_idx.x + thread_idx.x @@ -51,7 +52,7 @@ fn traditional_dot_product_p10_style[ barrier() - stride = SIZE // 2 + stride = WARP_SIZE // 2 while stride > 0: if local_i < stride: shared[local_i] += shared[local_i + stride] @@ -59,7 +60,7 @@ fn traditional_dot_product_p10_style[ stride //= 2 if local_i == 0: - output[0] = shared[0] + output[global_i // WARP_SIZE] = shared[0] # ANCHOR_END: traditional_approach_from_p10 @@ -85,7 +86,7 @@ fn simple_warp_dot_product[ # Only lane 0 writes the result (all lanes have the same total) if lane_id() == 0: - output[0] = total + output[global_i // WARP_SIZE] = total # ANCHOR_END: simple_warp_kernel_solution @@ -93,11 +94,14 @@ fn simple_warp_dot_product[ # ANCHOR: functional_warp_approach_solution fn functional_warp_dot_product[ - layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int + layout: Layout, + out_layout: Layout, + dtype: DType, + simd_width: Int, + rank: Int, + size: Int, ]( - output: LayoutTensor[ - mut=True, dtype, Layout.row_major(1), MutableAnyOrigin - ], + output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin], a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin], ctx: DeviceContext, @@ -114,7 +118,7 @@ fn functional_warp_dot_product[ if idx < size: a_val = a.load[1](idx, 0) b_val = b.load[1](idx, 0) - partial_product = (a_val * b_val).reduce_add() + partial_product = a_val * b_val else: partial_product = 0.0 @@ -123,314 +127,352 @@ fn functional_warp_dot_product[ # Only lane 0 writes the result (all lanes have the same total) if lane_id() == 0: - output.store[1](0, 0, total) + output.store[1](idx // WARP_SIZE, 0, total) - # Launch exactly WARP_SIZE threads (one warp) to process all elements - elementwise[compute_dot_product, 1, target="gpu"](WARP_SIZE, ctx) + # Launch exactly size == WARP_SIZE threads (one warp) to process all elements + elementwise[compute_dot_product, 1, target="gpu"](size, ctx) 
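+    # When `size` spans multiple warps (as in the larger benchmark sizes), this
+    # launch covers several warps: each warp reduces its own WARP_SIZE elements
+    # and its lane 0 writes the partial sum to output slot idx // WARP_SIZE,
+    # which is exactly what `expected_output` below computes per warp.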
# ANCHOR_END: functional_warp_approach_solution +fn expected_output[ + dtype: DType, n_warps: Int +]( + expected: HostBuffer[dtype], + a: DeviceBuffer[dtype], + b: DeviceBuffer[dtype], +) raises: + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i_warp in range(n_warps): + i_warp_in_buff = WARP_SIZE * i_warp + var warp_sum: Scalar[dtype] = 0 + for i in range(WARP_SIZE): + warp_sum += ( + a_host[i_warp_in_buff + i] * b_host[i_warp_in_buff + i] + ) + expected[i_warp] = warp_sum + + +fn rand_int[ + dtype: DType, size: Int +](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises: + with buff.map_to_host() as buff_host: + for i in range(size): + buff_host[i] = Int(random_float64(min, max)) + + +fn check_result[ + dtype: DType, size: Int, print_result: Bool = False +](actual: DeviceBuffer[dtype], expected: HostBuffer[dtype]) raises: + with actual.map_to_host() as actual_host: + if print_result: + print("=== RESULT ===") + print("actual:", actual_host) + print("expected:", expected) + for i in range(size): + assert_equal(actual_host[i], expected[i]) + + @parameter @always_inline -fn benchmark_simple_warp_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn simple_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_simple_warp_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_threads = WARP_SIZE + alias n_blocks = (ceildiv(test_size, n_threads), 1) - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + bench_ctx = DeviceContext() + + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - simple_warp_dot_product[test_layout, out_layout, test_size] + simple_warp_dot_product[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, - block_dim=THREADS_PER_BLOCK, + grid_dim=n_blocks, + block_dim=n_threads, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[simple_warp_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + 
keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline fn benchmark_functional_warp_parameterized[ test_size: Int -](mut b: Bencher) raises: - @parameter - @always_inline - fn functional_warp_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + bench_ctx = DeviceContext() - with a.map_to_host() as a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - a_tensor = LayoutTensor[mut=False, dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, test_layout]( - b_buf.unsafe_ptr() - ) - output_tensor = LayoutTensor[mut=True, dtype, Layout.row_major(1)]( - out.unsafe_ptr() - ) + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) + + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn functional_warp_workflow(ctx: DeviceContext) raises: functional_warp_dot_product[ - test_layout, dtype, SIMD_WIDTH, 1, test_size - ](output_tensor, a_tensor, b_tensor, ctx) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() + in_layout, out_layout, dtype, SIMD_WIDTH, 1, test_size + ](out_tensor, a_tensor, b_tensor, ctx) - bench_ctx = DeviceContext() - b.iter_custom[functional_warp_workflow](bench_ctx) + bencher.iter_custom[functional_warp_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() @parameter @always_inline -fn benchmark_traditional_parameterized[test_size: Int](mut b: Bencher) raises: - @parameter - @always_inline - fn traditional_workflow(ctx: DeviceContext) raises: - alias test_layout = Layout.row_major(test_size) - alias test_blocks = (ceildiv(test_size, WARP_SIZE), 1) +fn benchmark_traditional_parameterized[ + test_size: Int +](mut bencher: Bencher) raises: + alias n_warps = test_size // WARP_SIZE + alias in_layout = Layout.row_major(test_size) + alias out_layout = Layout.row_major(n_warps) + alias n_blocks = (ceildiv(test_size, WARP_SIZE), 1) + + bench_ctx = DeviceContext() - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) - b_buf = ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + out = bench_ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + b = bench_ctx.enqueue_create_buffer[dtype](test_size).enqueue_fill(0) + expected = bench_ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) - with a.map_to_host() as 
a_host, b_buf.map_to_host() as b_host: - for i in range(test_size): - a_host[i] = i - b_host[i] = i + rand_int[dtype, test_size](a) + rand_int[dtype, test_size](b) + expected_output[dtype, n_warps](expected, a, b) - out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[dtype, test_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[dtype, test_layout](b_buf.unsafe_ptr()) + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) + @parameter + @always_inline + fn traditional_workflow(ctx: DeviceContext) raises: ctx.enqueue_function[ - traditional_dot_product_p10_style[ - test_layout, out_layout, test_size - ] + traditional_dot_product_p12_style[in_layout, out_layout, test_size] ]( out_tensor, a_tensor, b_tensor, - grid_dim=test_blocks, + grid_dim=n_blocks, block_dim=THREADS_PER_BLOCK, ) - keep(out.unsafe_ptr()) - keep(a.unsafe_ptr()) - keep(b_buf.unsafe_ptr()) - ctx.synchronize() - bench_ctx = DeviceContext() - b.iter_custom[traditional_workflow](bench_ctx) + bencher.iter_custom[traditional_workflow](bench_ctx) + check_result[dtype, n_warps](out, expected) + keep(out.unsafe_ptr()) + keep(a.unsafe_ptr()) + keep(b.unsafe_ptr()) + bench_ctx.synchronize() def main(): - with DeviceContext() as ctx: - out = ctx.enqueue_create_buffer[dtype](1).enqueue_fill(0) - a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - a_host[i] = i - b_host[i] = i - - out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr()) - a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) - b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) - + if argv()[1] != "--benchmark": print("SIZE:", SIZE) print("WARP_SIZE:", WARP_SIZE) print("SIMD_WIDTH:", SIMD_WIDTH) - if argv()[1] == "--traditional": - ctx.enqueue_function[ - traditional_dot_product_p10_style[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - elif argv()[1] == "--kernel": - ctx.enqueue_function[ - simple_warp_dot_product[in_layout, out_layout, SIZE] - ]( - out_tensor, - a_tensor, - b_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - - elif argv()[1] == "--functional": - functional_warp_dot_product[in_layout, dtype, SIMD_WIDTH, 1, SIZE]( - out_tensor, a_tensor, b_tensor, ctx - ) - - elif argv()[1] == "--benchmark": - print("-" * 80) - bench_config = BenchConfig(max_iters=100) - bench = Bench(bench_config.copy()) - - print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") - bench.bench_function[ - benchmark_traditional_parameterized[WARP_SIZE] - ](BenchId("traditional_1x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[WARP_SIZE] - ](BenchId("simple_warp_1x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[WARP_SIZE] - ](BenchId("functional_warp_1x")) - - print("-" * 80) - print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") - bench.bench_function[ - benchmark_traditional_parameterized[4 * WARP_SIZE] - ](BenchId("traditional_4x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[4 * WARP_SIZE] - ](BenchId("simple_warp_4x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[4 * WARP_SIZE] - ](BenchId("functional_warp_4x")) - - 
print("-" * 80) - print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") - bench.bench_function[ - benchmark_traditional_parameterized[32 * WARP_SIZE] - ](BenchId("traditional_32x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[32 * WARP_SIZE] - ](BenchId("simple_warp_32x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[32 * WARP_SIZE] - ](BenchId("functional_warp_32x")) - - print("-" * 80) - print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") - bench.bench_function[ - benchmark_traditional_parameterized[256 * WARP_SIZE] - ](BenchId("traditional_256x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[256 * WARP_SIZE] - ](BenchId("simple_warp_256x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[256 * WARP_SIZE] - ](BenchId("functional_warp_256x")) - - print("-" * 80) - print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") - bench.bench_function[ - benchmark_traditional_parameterized[2048 * WARP_SIZE] - ](BenchId("traditional_2048x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("simple_warp_2048x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[2048 * WARP_SIZE] - ](BenchId("functional_warp_2048x")) - - print("-" * 80) - print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") - bench.bench_function[ - benchmark_traditional_parameterized[16384 * WARP_SIZE] - ](BenchId("traditional_16384x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("simple_warp_16384x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[16384 * WARP_SIZE] - ](BenchId("functional_warp_16384x")) - - print("-" * 80) - print( - "Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)" + alias n_warps = SIZE // WARP_SIZE + with DeviceContext() as ctx: + out = ctx.enqueue_create_buffer[dtype](n_warps).enqueue_fill(0) + a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0) + expected = ctx.enqueue_create_host_buffer[dtype]( + n_warps + ).enqueue_fill(0) + + out_tensor = LayoutTensor[mut=True, dtype, out_layout]( + out.unsafe_ptr() ) - bench.bench_function[ - benchmark_traditional_parameterized[65536 * WARP_SIZE] - ](BenchId("traditional_65536x")) - bench.bench_function[ - benchmark_simple_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("simple_warp_65536x")) - bench.bench_function[ - benchmark_functional_warp_parameterized[65536 * WARP_SIZE] - ](BenchId("functional_warp_65536x")) - - print(bench) - print("Benchmarks completed!") - print() - print("WARP OPERATIONS PERFORMANCE ANALYSIS:") - print( - " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD" - " (WARP_SIZE=64)" - ) - print(" - 1 x WARP_SIZE: Single warp baseline") - print(" - 4 x WARP_SIZE: Few warps, warp overhead visible") - print(" - 32 x WARP_SIZE: Medium scale, warp benefits emerge") - print(" - 256 x WARP_SIZE: Large scale, dramatic warp advantages") - print( - " - 2048 x WARP_SIZE: Massive scale, warp operations dominate" - ) - print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") - print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") - print( - " - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!" 
- ) - print() - print(" Expected Results at Large Scales:") - print(" • Traditional: Slower due to more barrier overhead") - print( - " • Warp operations: Faster, scale better with problem size" - ) - print(" • Memory bandwidth becomes the limiting factor") - return - - else: - print( - "Usage: --traditional | --kernel | --functional | --benchmark" - ) - return - - expected = ctx.enqueue_create_host_buffer[dtype](1).enqueue_fill(0) - ctx.synchronize() - - with a.map_to_host() as a_host, b.map_to_host() as b_host: - for i in range(SIZE): - expected[0] += a_host[i] * b_host[i] + a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr()) + b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr()) + + with a.map_to_host() as a_host, b.map_to_host() as b_host: + for i in range(SIZE): + a_host[i] = i + b_host[i] = i + + if argv()[1] == "--traditional": + ctx.enqueue_function[ + traditional_dot_product_p12_style[ + in_layout, out_layout, SIZE + ] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--kernel": + ctx.enqueue_function[ + simple_warp_dot_product[in_layout, out_layout, SIZE] + ]( + out_tensor, + a_tensor, + b_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + elif argv()[1] == "--functional": + functional_warp_dot_product[ + in_layout, out_layout, dtype, SIMD_WIDTH, 1, SIZE + ](out_tensor, a_tensor, b_tensor, ctx) + expected_output[dtype, n_warps](expected, a, b) + check_result[dtype, n_warps, True](out, expected) + ctx.synchronize() + elif argv()[1] == "--benchmark": + print("-" * 80) + bench_config = BenchConfig(max_iters=100, num_warmup_iters=1) + bench = Bench(bench_config.copy()) + + print("Testing SIZE=1 x WARP_SIZE, BLOCKS=1") + bench.bench_function[benchmark_traditional_parameterized[WARP_SIZE]]( + BenchId("traditional_1x") + ) + bench.bench_function[benchmark_simple_warp_parameterized[WARP_SIZE]]( + BenchId("simple_warp_1x") + ) + bench.bench_function[ + benchmark_functional_warp_parameterized[WARP_SIZE] + ](BenchId("functional_warp_1x")) + + print("-" * 80) + print("Testing SIZE=4 x WARP_SIZE, BLOCKS=4") + bench.bench_function[ + benchmark_traditional_parameterized[4 * WARP_SIZE] + ](BenchId("traditional_4x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[4 * WARP_SIZE] + ](BenchId("simple_warp_4x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[4 * WARP_SIZE] + ](BenchId("functional_warp_4x")) + + print("-" * 80) + print("Testing SIZE=32 x WARP_SIZE, BLOCKS=32") + bench.bench_function[ + benchmark_traditional_parameterized[32 * WARP_SIZE] + ](BenchId("traditional_32x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[32 * WARP_SIZE] + ](BenchId("simple_warp_32x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[32 * WARP_SIZE] + ](BenchId("functional_warp_32x")) + + print("-" * 80) + print("Testing SIZE=256 x WARP_SIZE, BLOCKS=256") + bench.bench_function[ + benchmark_traditional_parameterized[256 * WARP_SIZE] + ](BenchId("traditional_256x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[256 * WARP_SIZE] + ](BenchId("simple_warp_256x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[256 * WARP_SIZE] + ](BenchId("functional_warp_256x")) + + print("-" * 80) + print("Testing SIZE=2048 x WARP_SIZE, BLOCKS=2048") + bench.bench_function[ + benchmark_traditional_parameterized[2048 * WARP_SIZE] + ](BenchId("traditional_2048x")) + bench.bench_function[ + 
benchmark_simple_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("simple_warp_2048x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[2048 * WARP_SIZE] + ](BenchId("functional_warp_2048x")) + + print("-" * 80) + print("Testing SIZE=16384 x WARP_SIZE, BLOCKS=16384 (Large Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[16384 * WARP_SIZE] + ](BenchId("traditional_16384x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("simple_warp_16384x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[16384 * WARP_SIZE] + ](BenchId("functional_warp_16384x")) + + print("-" * 80) + print("Testing SIZE=65536 x WARP_SIZE, BLOCKS=65536 (Massive Scale)") + bench.bench_function[ + benchmark_traditional_parameterized[65536 * WARP_SIZE] + ](BenchId("traditional_65536x")) + bench.bench_function[ + benchmark_simple_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("simple_warp_65536x")) + bench.bench_function[ + benchmark_functional_warp_parameterized[65536 * WARP_SIZE] + ](BenchId("functional_warp_65536x")) + + print(bench) + print("Benchmarks completed!") + print() + print("WARP OPERATIONS PERFORMANCE ANALYSIS:") + print( + " GPU Architecture: NVIDIA (WARP_SIZE=32) vs AMD (WARP_SIZE=64)" + ) + print(" - 1,...,256 x WARP_SIZE: Grid size too small to benchmark") + print(" - 2048 x WARP_SIZE: Warp primative benefits emerge") + print(" - 16384 x WARP_SIZE: Large scale (512K-1M elements)") + print(" - 65536 x WARP_SIZE: Massive scale (2M-4M elements)") + print(" - Note: AMD GPUs process 2 x elements per warp vs NVIDIA!") + print() + print(" Expected Results at Large Scales:") + print(" • Traditional: Slower due to more barrier overhead") + print(" • Warp operations: Faster, scale better with problem size") + print(" • Memory bandwidth becomes the limiting factor") + return - with out.map_to_host() as out_host: - print("=== RESULT ===") - print("out:", out_host[0]) - print("expected:", expected[0]) - assert_equal(out_host[0], expected[0]) + else: + print("Usage: --traditional | --kernel | --functional | --benchmark") + return
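With this patch the kernels emit one partial sum per warp instead of a single scalar, so a caller that still wants the full dot product has to reduce `output` over the `n_warps` slots on the host. A minimal sketch of that reduction, assuming the `DeviceBuffer`/`HostBuffer` setup from `main()` above; the helper name `final_dot_product` is not part of the patch:

```mojo
from gpu.host import DeviceBuffer


fn final_dot_product[
    dtype: DType, n_warps: Int
](out: DeviceBuffer[dtype]) raises -> Scalar[dtype]:
    # Sum the per-warp partial results that lane 0 of each warp wrote.
    var total: Scalar[dtype] = 0
    with out.map_to_host() as out_host:
        for i_warp in range(n_warps):
            total += out_host[i_warp]
    return total
```

With the buffers from `main()`, `final_dot_product[dtype, n_warps](out)` recovers the single scalar that the pre-patch code stored in `output[0]`.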