diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp
index 6d02d3e7e7..50b38421d1 100644
--- a/exercises/kernel-matrix-transpose-local-array.cpp
+++ b/exercises/kernel-matrix-transpose-local-array.cpp
@@ -156,11 +156,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
       for (int tx = 0; tx < TILE_DIM; ++tx) {
         for (int ty = 0; ty < TILE_DIM; ++ty) {
 
-          int col = bx * TILE_DIM + tx;  // Matrix column index
-          int row = by * TILE_DIM + ty;  // Matrix row index
+          // Transpose tile offset
+          int col = by * TILE_DIM + tx;  // Matrix column index
+          int row = bx * TILE_DIM + ty;  // Matrix row index
 
           // Bounds check
-          if (row < N_r && col < N_c) {
+          if (row < N_c && col < N_r) {
             Atview(col, row) = Tile[ty][tx];
           }
         }
diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp
index d0101ca12e..8d1d7ddfeb 100644
--- a/exercises/kernel-matrix-transpose-local-array_solution.cpp
+++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp
@@ -81,6 +81,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   constexpr int N_c = 251;
 
   constexpr int TILE_DIM = 16;
+  constexpr int BLOCK_ROWS = 8;
 
   constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1;
   constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1;
@@ -153,15 +154,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
       //     Note: loop order is swapped from above so that output matrix
       //           data access is stride-1.
       //
-      for (int tx = 0; tx < TILE_DIM; ++tx) {
-        for (int ty = 0; ty < TILE_DIM; ++ty) {
+      for (int ty = 0; ty < TILE_DIM; ++ty) {
+        for (int tx = 0; tx < TILE_DIM; ++tx) {
 
-          int col = bx * TILE_DIM + tx;  // Matrix column index
-          int row = by * TILE_DIM + ty;  // Matrix row index
+          // Transpose tile offset
+          int col_t = by * TILE_DIM + tx;  // Matrix column index
+          int row_t = bx * TILE_DIM + ty;  // Matrix row index
 
           // Bounds check
-          if (row < N_r && col < N_c) {
-            Atview(col, row) = Tile[ty][tx];
+          if (row_t < N_c && col_t < N_r) {
+            Atview(row_t, col_t) = Tile[tx][ty];
           }
         }
       }
@@ -191,6 +193,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   using TILE_MEM =
     RAJA::LocalArray<int, RAJA::Perm<0, 1>, RAJA::SizeList<TILE_DIM, TILE_DIM>>;
   TILE_MEM Tile_Array;
+  using THREAD_ROWS = RAJA::TypedRangeSegment<int>;
+  using RAJA::Params;
+  using RAJA::Segs;
   // _mattranspose_localarray_end
 
   // **NOTE** Although the LocalArray is constructed
@@ -209,18 +214,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
           RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2>,
 
-          RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec,
-            RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec,
+          RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
+            RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
               RAJA::statement::Lambda<0>
             >
           >,
 
-          RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec,
-            RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec,
+          RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
+            RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
               RAJA::statement::Lambda<1>
             >
           >
-
           >
         >
       >
@@ -273,8 +277,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
           // loops. These loops copy data from the global matrices
           // to the local tile.
           //
-          RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec,
-            RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec,
+          RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
+            RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
                                        RAJA::statement::Lambda<0>
             >
           >,
@@ -286,8 +290,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
           //     swapped! This enables us to swap which
           //     index has unit stride.
           //
-          RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::seq_exec,
-            RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::seq_exec,
+          RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
+            RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
                                        RAJA::statement::Lambda<1>
             >
           >
@@ -302,13 +306,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
     RAJA::make_tuple((int)0, (int)0, Tile_Array),
 
     [=](int col, int row, int tx, int ty, TILE_MEM &_Tile_Array) {
-
       _Tile_Array(ty, tx) = Aview(row, col);
 
     },
 
     [=](int col, int row, int tx, int ty, TILE_MEM &_Tile_Array) {
-
       Atview(col, row) = _Tile_Array(ty, tx);
 
     }
@@ -401,21 +403,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
       //      These loops iterate over the number of
       //      tiles needed to carry out the transpose
       //
-      RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_loop,
-        RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_loop,
+      RAJA::statement::TileTCount<1, RAJA::statement::Param<1>,
+        RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_direct_unchecked,
+        RAJA::statement::TileTCount<0, RAJA::statement::Param<0>,
+          RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_direct_unchecked,
           // This statement will initalize local array memory inside a
           // kernel. The cpu_tile_mem policy specifies that memory should be
           // allocated on the stack. The entries in the RAJA::ParamList
           // identify RAJA local arrays to intialize in the parameter tuple.
-          RAJA::statement::InitLocalMem<RAJA::cuda_shared_mem, RAJA::ParamList<2>,
+          RAJA::statement::InitLocalMem<RAJA::cuda_shared_mem, RAJA::ParamList<3>,
             //
             // (1) Execution policies for the first set of inner
             // loops. These loops copy data from the global matrices
             // to the local tile.
             //
-            RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct,
-              RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct,
-                                          RAJA::statement::Lambda<0>
+            RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
+              RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::cuda_thread_x_direct,
+                                          RAJA::statement::Lambda<0, Segs<0, 2>, Params<0, 1, 2, 3>>
               >
             >,
             // Synchronize threads to ensure all loads
@@ -429,14 +433,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
             //     swapped! This enables us to swap which
             //     index has unit stride.
             //
-            RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct,
-              RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
-                                            RAJA::statement::Lambda<1>
+            RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
+              RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::cuda_thread_x_direct,
+                                            RAJA::statement::Lambda<1, Segs<0, 2>, Params<0, 1, 2, 3>>
               >
-            >,
-            // Synchronize threads to ensure all reads
-            // from the local array are complete
-            RAJA::statement::CudaSyncThreads
+            >
           >
         >
       >
@@ -446,18 +447,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   RAJA::kernel_param<CUDA_EXEC_POL>(
     RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, N_c),
-                     RAJA::TypedRangeSegment<int>(0, N_r)),
-    RAJA::make_tuple((int)0, (int)0, Tile_Array),
-
-    [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) {
-
-      Tile_Array(ty, tx) = Aview(row, col);
+                     RAJA::TypedRangeSegment<int>(0, N_r),
+                     THREAD_ROWS(0, BLOCK_ROWS)),
+    RAJA::make_tuple((int)0, (int)0, (int)0, Tile_Array),
+
+    [=] RAJA_DEVICE (int col, int ty, int bx, int by, int tx, TILE_MEM &Tile_Array) {
+      for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        int row = by * TILE_DIM + ty + j;
+        if (row < N_r) {
+          Tile_Array(ty + j, tx) = Aview(row, col);
+        }
+      }
 
     },
 
-    [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) {
-
-      Atview(col, row) = Tile_Array(ty, tx);
+    [=] RAJA_DEVICE(int RAJA_UNUSED_ARG(col), int ty, int bx, int by, int tx,
+                    TILE_MEM &Tile_Array) {
+      for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        int row_t = bx * TILE_DIM + ty + j;
+        int col_t = by * TILE_DIM + tx;
+        if (row_t < N_c && col_t < N_r) {
+          Atview(row_t, col_t) = Tile_Array(tx, ty + j);
+        }
+      }
 
     }
   );
@@ -496,21 +508,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
       //      These loops iterate over the number of
       //      tiles needed to carry out the transpose
       //
-      RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_loop,
-        RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_loop,
+      RAJA::statement::TileTCount<1, RAJA::statement::Param<1>,
+        RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_direct_unchecked,
+        RAJA::statement::TileTCount<0, RAJA::statement::Param<0>,
+          RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_direct_unchecked,
           // This statement will initalize local array memory inside a
           // kernel. The cpu_tile_mem policy specifies that memory should be
           // allocated on the stack. The entries in the RAJA::ParamList
           // identify RAJA local arrays to intialize in the parameter tuple.
-          RAJA::statement::InitLocalMem<RAJA::hip_shared_mem, RAJA::ParamList<2>,
+          RAJA::statement::InitLocalMem<RAJA::hip_shared_mem, RAJA::ParamList<3>,
             //
             // (1) Execution policies for the first set of inner
             // loops. These loops copy data from the global matrices
             // to the local tile.
             //
-            RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct,
-              RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct,
-                                          RAJA::statement::Lambda<0>
+            RAJA::statement::For<2, RAJA::hip_thread_y_direct,
+              RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::hip_thread_x_direct,
+                                          RAJA::statement::Lambda<0, Segs<0, 2>, Params<0, 1, 2, 3>>
               >
             >,
             // Synchronize threads to ensure all loads
@@ -524,14 +538,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
             //     swapped! This enables us to swap which
             //     index has unit stride.
             //
-            RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct,
-              RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct,
-                                            RAJA::statement::Lambda<1>
+            RAJA::statement::For<2, RAJA::hip_thread_y_direct,
+              RAJA::statement::ForICount<0, RAJA::statement::Param<2>, RAJA::hip_thread_x_direct,
+                                            RAJA::statement::Lambda<1, Segs<0, 2>, Params<0, 1, 2, 3>>
               >
-            >,
-            // Synchronize threads to ensure all reads
-            // from the local array are complete
-            RAJA::statement::HipSyncThreads
+            >
           >
         >
       >
@@ -541,23 +552,35 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   RAJA::kernel_param<HIP_EXEC_POL>(
     RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, N_c),
-                     RAJA::TypedRangeSegment<int>(0, N_r)),
-    RAJA::make_tuple((int)0, (int)0, Tile_Array),
-
-    [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) {
-
-      Tile_Array(ty, tx) = d_Aview(row, col);
+                     RAJA::TypedRangeSegment<int>(0, N_r),
+                     THREAD_ROWS(0, BLOCK_ROWS)),
+    RAJA::make_tuple((int)0, (int)0, (int)0, Tile_Array),
+
+    [=] RAJA_DEVICE (int col, int ty, int bx, int by, int tx, TILE_MEM &Tile_Array) {
+      for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        int row = by * TILE_DIM + ty + j;
+        if (row < N_r) {
+          Tile_Array(ty + j, tx) = d_Aview(row, col);
+        }
+      }
 
     },
 
-    [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) {
-
-      d_Atview(col, row) = Tile_Array(ty, tx);
+    [=] RAJA_DEVICE(int RAJA_UNUSED_ARG(col), int ty, int bx, int by, int tx,
+                    TILE_MEM &Tile_Array) {
+      for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
+        int row_t = bx * TILE_DIM + ty + j;
+        int col_t = by * TILE_DIM + tx;
+        if (row_t < N_c && col_t < N_r) {
+          d_Atview(row_t, col_t) = Tile_Array(tx, ty + j);
+        }
+      }
 
     }
   );
 
   CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
+  CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
   checkResult<int>(Atview, N_c, N_r);
   // printResult<int>(Atview, N_c, N_r);
 #endif
@@ -573,7 +596,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   using RAJA::Offsets;
   using RAJA::Params;
 
-  // _raja_mattranspose_lambdaargs_start
+  // _mattranspose_localarray_raja_lambdaargs_start
   using SEQ_EXEC_POL_II =
     RAJA::KernelPolicy<
       RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,
@@ -612,7 +635,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
       Atview(col, row) = _Tile_Array(ty, tx);
     }
   );
-  // _raja_mattranspose_lambdaargs_start
+  // _mattranspose_localarray_raja_lambdaargs_end
 
   checkResult<int>(Atview, N_c, N_r);
   // printResult<int>(Atview, N_c, N_r);
diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp
index 229e28a6b7..6421785945 100644
--- a/exercises/launch-matrix-transpose-local-array.cpp
+++ b/exercises/launch-matrix-transpose-local-array.cpp
@@ -16,9 +16,9 @@
 #include "memoryManager.hpp"
 
 /*
- *  Matrix Transpose Example
+ *  Matrix Transpose Exercise
  *
- *  In this example, an input matrix A of dimension N_r x N_c is
+ *  In this exercise, an input matrix A of dimension N_r x N_c is
  *  transposed and returned as a second matrix At of size N_c x N_r.
  *
  *  This operation is carried out using a local memory tiling
@@ -32,7 +32,7 @@
  *  data into the tile; while outer loops will iterate over the number
  *  of tiles needed to carry out the transpose.
  *
- *  RAJA variants of the example use RAJA_TEAM_SHARED as tile memory.
+ *  RAJA variants of the exercise use RAJA_TEAM_SHARED as tile memory.
  *  Furthermore, the tiling pattern is handled by RAJA's tile methods.
  *  For CPU execution, RAJA_TEAM_SHARED are used to improve
  *  performance via cache blocking. For CUDA GPU execution,
@@ -70,7 +70,7 @@ void printResult(RAJA::View<T, RAJA::Layout<DIM>> Atview, int N_r, int N_c);
 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 {
 
-  std::cout << "\n\nRAJA shared matrix transpose example...\n";
+  std::cout << "\n\nRAJA shared matrix transpose exercise...\n";
 
   //
   // Define num rows/cols in matrix, tile dimensions, and number of tiles
@@ -147,18 +147,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
       //
       // (2) Inner loops to write array data into output array tile
       //
-      //     Note: loop order is swapped from above so that output matrix
-      //           data access is stride-1.
       //
       for (int tx = 0; tx < TILE_DIM; ++tx) {
         for (int ty = 0; ty < TILE_DIM; ++ty) {
 
-          int col = bx * TILE_DIM + tx;  // Matrix column index
-          int row = by * TILE_DIM + ty;  // Matrix row index
+          // Transpose tile offset
+          int col_t = by * TILE_DIM + tx;  // Matrix column index
+          int row_t = bx * TILE_DIM + ty;  // Matrix row index
 
           // Bounds check
-          if (row < N_r && col < N_c) {
-            Atview(col, row) = Tile[ty][tx];
+          if (row_t < N_c && col_t < N_r) {
+            Atview(row_t, col_t) = Tile[tx][ty];
           }
         }
       }
@@ -172,7 +171,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   //----------------------------------------------------------------------------//
 
-  std::cout << "\n Running RAJA - sequential matrix transpose example ...\n";
+  std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n";
 
   std::memset(At, 0, N_r * N_c * sizeof(int));
 
@@ -197,10 +196,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
           /// input matrix into the RAJA_TEAM_SHARED memory array
           ///
 
-          RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
-            RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
+          RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row_t, int ty) {
+            RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col_t, int tx) {
 
-              Atview(col, row) = Tile_Array[ty][tx];
+              Atview(row_t, col_t) = Tile_Array[tx][ty];
 
             });
           });
@@ -217,7 +216,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 #if defined(RAJA_ENABLE_OPENMP)
   //--------------------------------------------------------------------------//
   std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix "
-               "transpose example ...\n";
+               "transpose exercise ...\n";
 
   std::memset(At, 0, N_r * N_c * sizeof(int));
 
@@ -246,18 +245,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
             RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];
 
-            RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
-              RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
+          RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
+            RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
 
-                Tile_Array[ty][tx] = Aview(row, col);
+              Tile_Array[ty][tx] = Aview(row, col);
 
               });
             });
 
-            RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
-              RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
+          RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row_t, int ty) {
+            RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col_t, int tx) {
 
-                Atview(col, row) = Tile_Array[ty][tx];
+              Atview(row_t, col_t) = Tile_Array[tx][ty];
 
                 });
               });
@@ -273,7 +272,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   //--------------------------------------------------------------------------//
 #if defined(RAJA_ENABLE_CUDA)
-  std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n";
+  std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n";
 
   std::memset(At, 0, N_r * N_c * sizeof(int));
 
@@ -309,10 +308,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
             });
           });
 
-          RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
-            RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
+          ctx.teamSync();
 
-              Atview(col, row) = Tile_Array[ty][tx];
+          RAJA::loop_icount<cuda_threads_y>(ctx, col_tile, [&] (int row_t, int ty) {
+            RAJA::loop_icount<cuda_threads_x>(ctx, row_tile, [&] (int col_t, int tx) {
+
+              Atview(row_t, col_t) = Tile_Array[tx][ty];
 
             });
           });
@@ -330,7 +331,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
 #if defined(RAJA_ENABLE_HIP)
   //--------------------------------------------------------------------------//
-  std::cout << "\n Running RAJA - HIP matrix transpose example ...\n";
+  std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n";
 
   int *d_A = memoryManager::allocate_gpu<int>(N_r * N_c);
   int *d_At = memoryManager::allocate_gpu<int>(N_r * N_c);
@@ -381,10 +382,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
             });
           });
 
-          RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
-            RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
+          ctx.teamSync();
+
+          RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int row_t, int ty) {
+            RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int col_t, int tx) {
 
-              d_Atview(col, row) = Tile_Array[ty][tx];
+              d_Atview(row_t, col_t) = Tile_Array[tx][ty];
 
             });
           });
diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp
index 0d407c45bd..a4131aa4c0 100644
--- a/exercises/launch-matrix-transpose-local-array_solution.cpp
+++ b/exercises/launch-matrix-transpose-local-array_solution.cpp
@@ -16,9 +16,9 @@
 #include "memoryManager.hpp"
 
 /*
- *  Matrix Transpose Example
+ *  Matrix Transpose Exercise
  *
- *  In this example, an input matrix A of dimension N_r x N_c is
+ *  In this exercise, an input matrix A of dimension N_r x N_c is
  *  transposed and returned as a second matrix At of size N_c x N_r.
  *
  *  This operation is carried out using a local memory tiling
@@ -32,7 +32,7 @@
  *  data into the tile; while outer loops will iterate over the number
  *  of tiles needed to carry out the transpose.
  *
- *  RAJA variants of the example use RAJA_TEAM_SHARED as tile memory.
+ *  RAJA variants of the exercise use RAJA_TEAM_SHARED as tile memory.
  *  Furthermore, the tiling pattern is handled by RAJA's tile methods.
  *  For CPU execution, RAJA_TEAM_SHARED are used to improve
  *  performance via cache blocking. For CUDA GPU execution,
@@ -53,6 +53,7 @@
 //
 const int DIM = 2;
 #define TILE_DIM (16)  // #define to appease msvc
+#define BLOCK_ROWS (8)
 
 //
 // Function for checking results
@@ -70,7 +71,7 @@ void printResult(RAJA::View<T, RAJA::Layout<DIM>> Atview, int N_r, int N_c);
 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 {
 
-  std::cout << "\n\nRAJA shared matrix transpose example...\n";
+  std::cout << "\n\nRAJA shared matrix transpose exercise...\n";
 
   //
   // Define num rows/cols in matrix, tile dimensions, and number of tiles
@@ -147,18 +148,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
       //
       // (2) Inner loops to write array data into output array tile
       //
-      //     Note: loop order is swapped from above so that output matrix
-      //           data access is stride-1.
       //
-      for (int tx = 0; tx < TILE_DIM; ++tx) {
-        for (int ty = 0; ty < TILE_DIM; ++ty) {
+      for (int ty = 0; ty < TILE_DIM; ++ty) {
+        for (int tx = 0; tx < TILE_DIM; ++tx) {
 
-          int col = bx * TILE_DIM + tx;  // Matrix column index
-          int row = by * TILE_DIM + ty;  // Matrix row index
+          // Transpose tile offset
+          int col_t = by * TILE_DIM + tx;  // Matrix column index
+          int row_t = bx * TILE_DIM + ty;  // Matrix row index
 
           // Bounds check
-          if (row < N_r && col < N_c) {
-            Atview(col, row) = Tile[ty][tx];
+          if (row_t < N_c && col_t < N_r) {
+            Atview(row_t, col_t) = Tile[tx][ty];
           }
         }
       }
@@ -172,7 +172,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   //----------------------------------------------------------------------------//
 
-  std::cout << "\n Running RAJA - sequential matrix transpose example ...\n";
+  std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n";
 
   std::memset(At, 0, N_r * N_c * sizeof(int));
 
@@ -184,11 +184,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
     RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
 
-      RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
+      RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
+          [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
 
-        RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
+        RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
+          [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
 
-          RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];
+          RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM];
 
           RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
             RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
@@ -198,10 +200,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
             });
           });
 
-          RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
-            RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
+          RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row_t, int ty) {
+            RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col_t, int tx) {
 
-              Atview(col, row) = Tile_Array[ty][tx];
+              Atview(row_t, col_t) = Tile_Array[tx][ty];
 
             });
           });
@@ -218,7 +220,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 #if defined(RAJA_ENABLE_OPENMP)
   //--------------------------------------------------------------------------//
   std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix "
-               "transpose example ...\n";
+               "transpose exercise ...\n";
 
   std::memset(At, 0, N_r * N_c * sizeof(int));
 
@@ -234,11 +236,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
     RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
 
-      RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
+      RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
+        [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
 
-        RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
+        RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
+          [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
 
-          RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];
+          RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM];
 
           RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
             RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
@@ -248,10 +252,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
             });
           });
 
-          RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
-            RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
+          RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row_t, int ty) {
+            RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col_t, int tx) {
 
-              Atview(col, row) = Tile_Array[ty][tx];
+              Atview(row_t, col_t) = Tile_Array[tx][ty];
 
             });
           });
@@ -267,17 +271,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   //--------------------------------------------------------------------------//
 #if defined(RAJA_ENABLE_CUDA)
-  std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n";
+  std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n";
 
   std::memset(At, 0, N_r * N_c * sizeof(int));
 
   constexpr int c_block_sz = TILE_DIM;
-  constexpr int r_block_sz = TILE_DIM;
-  const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
-  const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);
+  constexpr int r_block_sz = BLOCK_ROWS;
+  const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, TILE_DIM);
+  const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, TILE_DIM);
 
-  using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct>;
-  using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
+  using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct_unchecked>;
+  using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct_unchecked>;
 
   using cuda_threads_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>;
   using cuda_threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
@@ -287,33 +291,46 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   RAJA::launch<cuda_launch_policy>(
     RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
-                     RAJA::Threads(c_block_sz, r_block_sz)),
+                       RAJA::Threads(c_block_sz, r_block_sz)),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
 
-      RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
+      RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
+        [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
 
-        RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
+        RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
+          [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
 
-          RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];
+          RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM];  // element type matches the int matrices being transposed
 
-          RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
+          RAJA::loop_icount<cuda_threads_y>(ctx, RAJA::TypedRangeSegment<int>(0, BLOCK_ROWS), [&] (int row_base, int ty) {
             RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
-
-              Tile_Array[ty][tx] = Aview(row, col);
+              for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {  // each thread row loads TILE_DIM / BLOCK_ROWS rows of the tile
+                int row = (*row_tile.begin()) + row_base + j;
+                if (row < N_r) {  // the last row tile may be partial
+                  Tile_Array[ty + j][tx] = Aview(row, col);
+                }
+              }
 
             });
           });
 
-         RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
-           RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
+          ctx.teamSync();  // all loads into Tile_Array must complete before any thread reads entries written by other threads
 
-             Atview(col, row) = Tile_Array[ty][tx];
+          RAJA::loop_icount<cuda_threads_y>(ctx, RAJA::TypedRangeSegment<int>(0, BLOCK_ROWS), [&] (int row_base, int ty) {
+            RAJA::loop_icount<cuda_threads_x>(ctx, RAJA::TypedRangeSegment<int>(0, TILE_DIM), [&] (int RAJA_UNUSED_ARG(col), int tx) {
+              for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {  // each thread row stores TILE_DIM / BLOCK_ROWS output rows
+                int row_t = (*col_tile.begin()) + row_base + j;
+                int col_t = (*row_tile.begin()) + tx;  // tx spans the full TILE_DIM here: it indexes the tile's rows, so clipping it to a partial col_tile would leave output columns unwritten
+                if (row_t < N_c && col_t < N_r) {  // guards both partial edges of the transposed tile
+                  Atview(row_t, col_t) = Tile_Array[tx][ty + j];
+                }
+              }
 
-           });
-         });
+            });
+          });
 
-       });
-     });
+        });
+      });
 
    });
 
@@ -325,7 +342,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
 #if defined(RAJA_ENABLE_HIP)
   //--------------------------------------------------------------------------//
-  std::cout << "\n Running RAJA - HIP matrix transpose example ...\n";
+  std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n";  // progress banner printed before the HIP variant runs
 
   int *d_A = memoryManager::allocate_gpu<int>(N_r * N_c);
   int *d_At = memoryManager::allocate_gpu<int>(N_r * N_c);
@@ -344,12 +361,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice);
 
   constexpr int c_block_sz = TILE_DIM;
-  constexpr int r_block_sz = TILE_DIM;
-  const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
-  const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);
+  constexpr int r_block_sz = BLOCK_ROWS;  // second Threads() extent: BLOCK_ROWS thread rows serve each TILE_DIM-row tile
+  const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, TILE_DIM);  // team counts are in whole TILE_DIM tiles, independent of the thread extents
+  const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, TILE_DIM);
 
-  using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct>;
-  using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
+  using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct_unchecked>;  // NOTE(review): "unchecked" presumably drops the per-block range check, so Teams() must equal the tile counts - confirm
+  using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct_unchecked>;
 
   using hip_threads_y = RAJA::LoopPolicy<RAJA::hip_thread_y_direct>;
   using hip_threads_x = RAJA::LoopPolicy<RAJA::hip_thread_x_direct>;
@@ -359,37 +376,51 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   RAJA::launch<hip_launch_policy>
      (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
-                     RAJA::Threads(c_block_sz, r_block_sz)),
+                         RAJA::Threads(c_block_sz, r_block_sz)),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
 
-      RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
+      RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
+        [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
 
-        RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
+        RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
+          [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
 
-          RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];
+          RAJA_TEAM_SHARED int Tile_Array[TILE_DIM][TILE_DIM];  // element type matches the int matrices being transposed
 
-          RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
+          RAJA::loop_icount<hip_threads_y>(ctx, RAJA::TypedRangeSegment<int>(0, BLOCK_ROWS), [&] (int row_base, int ty) {
             RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
-
-              Tile_Array[ty][tx] = d_Aview(row, col);
+              for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {  // each thread row loads TILE_DIM / BLOCK_ROWS rows of the tile
+                int row = (*row_tile.begin()) + row_base + j;
+                if (row < N_r) {  // the last row tile may be partial
+                  Tile_Array[ty + j][tx] = d_Aview(row, col);
+                }
+              }
 
             });
           });
 
-          RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
-           RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
+          ctx.teamSync();  // all loads into Tile_Array must complete before any thread reads entries written by other threads
 
-             d_Atview(col, row) = Tile_Array[ty][tx];
+          RAJA::loop_icount<hip_threads_y>(ctx, RAJA::TypedRangeSegment<int>(0, BLOCK_ROWS), [&] (int row_base, int ty) {
+            RAJA::loop_icount<hip_threads_x>(ctx, RAJA::TypedRangeSegment<int>(0, TILE_DIM), [&] (int RAJA_UNUSED_ARG(col), int tx) {
+              for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {  // each thread row stores TILE_DIM / BLOCK_ROWS output rows
+                int row_t = (*col_tile.begin()) + row_base + j;
+                int col_t = (*row_tile.begin()) + tx;  // tx spans the full TILE_DIM here: it indexes the tile's rows, so clipping it to a partial col_tile would leave output columns unwritten
+                if (row_t < N_c && col_t < N_r) {  // guards both partial edges of the transposed tile
+                  d_Atview(row_t, col_t) = Tile_Array[tx][ty + j];
+                }
+              }
 
-           });
-         });
+            });
+          });
 
-       });
-     });
+        });
+      });
 
    });
 
   CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
+  CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);  // surface any pending device-side error before the host verifies At
   checkResult<int>(Atview, N_c, N_r);
   // printResult<int>(Atview, N_c, N_r);
 #endif