diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 6d02d3e7e7..50b38421d1 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -156,11 +156,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col = by * TILE_DIM + tx; // Matrix column index + int row = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row < N_c && col < N_r) { Atview(col, row) = Tile[ty][tx]; } } diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index d0101ca12e..8d1d7ddfeb 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -81,6 +81,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) constexpr int N_c = 251; constexpr int TILE_DIM = 16; + constexpr int BLOCK_ROWS = 8; constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; @@ -153,15 +154,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Note: loop order is swapped from above so that output matrix // data access is stride-1.
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -191,6 +193,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using TILE_MEM = RAJA::LocalArray, RAJA::SizeList>; TILE_MEM Tile_Array; + using THREAD_ROWS = RAJA::TypedRangeSegment; + using RAJA::Params; + using RAJA::Segs; // _mattranspose_localarray_end // **NOTE** Although the LocalArray is constructed @@ -209,18 +214,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::InitLocalMem, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, RAJA::statement::Lambda<0> > >, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, RAJA::statement::Lambda<1> > > - > > > @@ -273,8 +277,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // loops. These loops copy data from the global matrices // to the local tile.
// - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, RAJA::statement::Lambda<0> > >, @@ -286,8 +290,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // swapped! This enables us to swap which // index has unit stride. // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec, RAJA::statement::Lambda<1> > > @@ -302,13 +306,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::make_tuple((int)0, (int)0, Tile_Array), [=](int col, int row, int tx, int ty, TILE_MEM &_Tile_Array) { - _Tile_Array(ty, tx) = Aview(row, col); }, [=](int col, int row, int tx, int ty, TILE_MEM &_Tile_Array) { - Atview(col, row) = _Tile_Array(ty, tx); } @@ -401,21 +403,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::TileTCount<1, RAJA::statement::Param<1>, + RAJA::tile_fixed, RAJA::cuda_block_y_direct_unchecked, + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, + RAJA::tile_fixed, RAJA::cuda_block_x_direct_unchecked, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList // identify RAJA local arrays to intialize in the parameter tuple. 
- RAJA::statement::InitLocalMem, + RAJA::statement::InitLocalMem, // // (1) Execution policies for the first set of inner // loops. These loops copy data from the global matrices // to the local tile. // - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<0> + RAJA::statement::For<2, RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0, Segs<0, 2>, Params<0, 1, 2, 3>> > >, // Synchronize threads to ensure all loads @@ -429,14 +433,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // swapped! This enables us to swap which // index has unit stride. // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, - RAJA::statement::Lambda<1> + RAJA::statement::For<2, RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1, Segs<0, 2>, Params<0, 1, 2, 3>> > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::CudaSyncThreads + > > > > @@ -446,18 +447,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = Aview(row, col); + RAJA::TypedRangeSegment(0, N_r), + THREAD_ROWS(0, BLOCK_ROWS)), + RAJA::make_tuple((int)0, (int)0, (int)0, Tile_Array), + + [=] RAJA_DEVICE (int col, int ty, int bx, int by, int tx, TILE_MEM &Tile_Array) { + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row = by * 
TILE_DIM + ty + j; + if (row < N_r) { + Tile_Array(ty + j, tx) = Aview(row, col); + } + } }, - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Atview(col, row) = Tile_Array(ty, tx); + [=] RAJA_DEVICE(int RAJA_UNUSED_ARG(col), int ty, int bx, int by, int tx, + TILE_MEM &Tile_Array) { + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row_t = bx * TILE_DIM + ty + j; + int col_t = by * TILE_DIM + tx; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile_Array(tx, ty + j); + } + } } ); @@ -496,21 +508,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::TileTCount<1, RAJA::statement::Param<1>, + RAJA::tile_fixed, RAJA::hip_block_y_direct_unchecked, + RAJA::statement::TileTCount<0, RAJA::statement::Param<0>, + RAJA::tile_fixed, RAJA::hip_block_x_direct_unchecked, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList // identify RAJA local arrays to intialize in the parameter tuple. - RAJA::statement::InitLocalMem, + RAJA::statement::InitLocalMem, // // (1) Execution policies for the first set of inner // loops. These loops copy data from the global matrices // to the local tile. 
// - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<0> + RAJA::statement::For<2, RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0, Segs<0, 2>, Params<0, 1, 2, 3>> > >, // Synchronize threads to ensure all loads @@ -524,14 +538,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // swapped! This enables us to swap which // index has unit stride. // - RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, - RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, - RAJA::statement::Lambda<1> + RAJA::statement::For<2, RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<1, Segs<0, 2>, Params<0, 1, 2, 3>> > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::HipSyncThreads + > > > > @@ -541,23 +552,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::kernel_param( RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), - RAJA::TypedRangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), - - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - Tile_Array(ty, tx) = d_Aview(row, col); + RAJA::TypedRangeSegment(0, N_r), + THREAD_ROWS(0, BLOCK_ROWS)), + RAJA::make_tuple((int)0, (int)0, (int)0, Tile_Array), + + [=] RAJA_DEVICE (int col, int ty, int bx, int by, int tx, TILE_MEM &Tile_Array) { + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row = by * TILE_DIM + ty + j; + if (row < N_r) { + Tile_Array(ty + j, tx) = d_Aview(row, col); + } + } }, - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - - d_Atview(col, row) = 
Tile_Array(ty, tx); + [=] RAJA_DEVICE(int RAJA_UNUSED_ARG(col), int ty, int bx, int by, int tx, + TILE_MEM &Tile_Array) { + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row_t = bx * TILE_DIM + ty + j; + int col_t = by * TILE_DIM + tx; + if (row_t < N_c && col_t < N_r) { + d_Atview(row_t, col_t) = Tile_Array(tx, ty + j); + } + } } ); CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost); + CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif @@ -573,7 +596,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using RAJA::Offsets; using RAJA::Params; - // _raja_mattranspose_lambdaargs_start + // _mattranspose_localarray_raja_lambdaargs_start using SEQ_EXEC_POL_II = RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, @@ -612,7 +635,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) Atview(col, row) = _Tile_Array(ty, tx); } ); - // _raja_mattranspose_lambdaargs_start + // _mattranspose_localarray_raja_lambdaargs_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index 229e28a6b7..6421785945 100644 --- a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -16,9 +16,9 @@ #include "memoryManager.hpp" /* - * Matrix Transpose Example + * Matrix Transpose Exercise * - * In this example, an input matrix A of dimension N_r x N_c is + * In this exercise, an input matrix A of dimension N_r x N_c is * transposed and returned as a second matrix At of size N_c x N_r. * * This operation is carried out using a local memory tiling @@ -32,7 +32,7 @@ * data into the tile; while outer loops will iterate over the number * of tiles needed to carry out the transpose. 
* - * RAJA variants of the example use RAJA_TEAM_SHARED as tile memory. + * RAJA variants of the exercise use RAJA_TEAM_SHARED as tile memory. * Furthermore, the tiling pattern is handled by RAJA's tile methods. * For CPU execution, RAJA_TEAM_SHARED are used to improve * performance via cache blocking. For CUDA GPU execution, @@ -70,7 +70,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA shared matrix transpose example...\n"; + std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; // // Define num rows/cols in matrix, tile dimensions, and number of tiles @@ -147,18 +147,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (2) Inner loops to write array data into output array tile // - // Note: loop order is swapped from above so that output matrix - // data access is stride-1. // for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -172,7 +171,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -197,10 +196,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// input matrix into the RAJA_TEAM_SHARED memory array /// - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx)
{ - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -217,7 +216,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " - "transpose example ...\n"; + "transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -246,18 +245,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - Tile_Array[ty][tx] = Aview(row, col); + Tile_Array[ty][tx] = Aview(row, col); }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -273,7 +272,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -309,10 +308,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - 
RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); - Atview(col, row) = Tile_Array[ty][tx]; + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { + + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -330,7 +331,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; int *d_A = memoryManager::allocate_gpu(N_r * N_c); int *d_At = memoryManager::allocate_gpu(N_r * N_c); @@ -381,10 +382,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); + + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - d_Atview(col, row) = Tile_Array[ty][tx]; + d_Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index 0d407c45bd..a4131aa4c0 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -16,9 +16,9 @@ #include "memoryManager.hpp" /* - * Matrix Transpose Example + * Matrix Transpose Exercise * - * In this example, an input matrix A of dimension N_r x N_c is + * In this exercise, an input matrix A of dimension N_r x N_c is * transposed and returned as a second matrix At of size N_c x N_r. * * This operation is carried out using a local memory tiling @@ -32,7 +32,7 @@ * data into the tile; while outer loops will iterate over the number * of tiles needed to carry out the transpose. 
* - * RAJA variants of the example use RAJA_TEAM_SHARED as tile memory. + * RAJA variants of the exercise use RAJA_TEAM_SHARED as tile memory. * Furthermore, the tiling pattern is handled by RAJA's tile methods. * For CPU execution, RAJA_TEAM_SHARED are used to improve * performance via cache blocking. For CUDA GPU execution, @@ -53,6 +53,7 @@ // const int DIM = 2; #define TILE_DIM (16) // #define to appease msvc +#define BLOCK_ROWS (8) // // Function for checking results @@ -70,7 +71,7 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA shared matrix transpose example...\n"; + std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; // // Define num rows/cols in matrix, tile dimensions, and number of tiles @@ -147,18 +148,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (2) Inner loops to write array data into output array tile // - // Note: loop order is swapped from above so that output matrix - // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -172,7 +172,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -184,11 +184,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM]; RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { @@ -198,10 +200,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { -
RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -218,7 +220,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " - "transpose example ...\n"; + "transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -234,11 +236,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM]; RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { @@ -248,10 +252,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -267,17 +271,17 @@ int main(int 
RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); constexpr int c_block_sz = TILE_DIM; - constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + constexpr int r_block_sz = BLOCK_ROWS; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, TILE_DIM); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, TILE_DIM); - using cuda_teams_y = RAJA::LoopPolicy; - using cuda_teams_x = RAJA::LoopPolicy; + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; using cuda_threads_y = RAJA::LoopPolicy; using cuda_threads_x = RAJA::LoopPolicy; @@ -287,33 +291,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), + RAJA::Threads(c_block_sz, r_block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM]; - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, RAJA::TypedRangeSegment(0, BLOCK_ROWS), [&] (int row_base, int ty) { 
RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = Aview(row, col); + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row = (*row_tile.begin()) + row_base + j; + if (row < N_r) { + Tile_Array[ty + j][tx] = Aview(row, col); + } + } }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); - Atview(col, row) = Tile_Array[ty][tx]; + RAJA::loop_icount(ctx, RAJA::TypedRangeSegment(0, BLOCK_ROWS), [&] (int row_base, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int RAJA_UNUSED_ARG(col), int tx) { + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row_t = (*col_tile.begin()) + row_base + j; + int col_t = (*row_tile.begin()) + tx; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile_Array[tx][ty + j]; + } + } - }); - }); + }); + }); - }); - }); + }); + }); }); @@ -325,7 +342,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; int *d_A = memoryManager::allocate_gpu(N_r * N_c); int *d_At = memoryManager::allocate_gpu(N_r * N_c); @@ -344,12 +361,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice); constexpr int c_block_sz = TILE_DIM; - constexpr int r_block_sz = TILE_DIM; - const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); - const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + constexpr int r_block_sz = BLOCK_ROWS; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, TILE_DIM); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, TILE_DIM); - using hip_teams_y = RAJA::LoopPolicy; - using hip_teams_x = 
RAJA::LoopPolicy; + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; using hip_threads_y = RAJA::LoopPolicy; using hip_threads_x = RAJA::LoopPolicy; @@ -359,37 +376,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), + RAJA::Threads(c_block_sz, r_block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { - RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM]; - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, RAJA::TypedRangeSegment(0, BLOCK_ROWS), [&] (int row_base, int ty) { RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - - Tile_Array[ty][tx] = d_Aview(row, col); + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row = (*row_tile.begin()) + row_base + j; + if (row < N_r) { + Tile_Array[ty + j][tx] = d_Aview(row, col); + } + } }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); - d_Atview(col, row) = Tile_Array[ty][tx]; + RAJA::loop_icount(ctx, RAJA::TypedRangeSegment(0, BLOCK_ROWS), [&] (int row_base, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int RAJA_UNUSED_ARG(col), int tx) { + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + int row_t = (*col_tile.begin()) + row_base + j; + int col_t = (*row_tile.begin()) + tx; + if (row_t < N_c && col_t < N_r) { + 
d_Atview(row_t, col_t) = Tile_Array[tx][ty + j]; + } + } - }); - }); + }); + }); - }); - }); + }); + }); }); CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost); + CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif