Design change: replace lhsTile/rhsTile options with registerBlocking

libxsmm · Jan 28, 2025 · 1806177 · 1806177
1 parent bc90eb6
commit 1806177
Show file tree

Hide file tree

Showing 10 changed files with 144 additions and 135 deletions.
diff --git a/benchmarks/config/base/base.json b/benchmarks/config/base/base.json
@@ -36,13 +36,27 @@
       "flags": [ "-n", "100" ],
       "extensions": []
     },
-    "gemm_fp32_mlir_vector": {
+    "gemm_fp32_mlir_vector_avx512": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": {},
-      "flags": [ "-n", "100",  "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100",  "-run-args='--vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": []
     },
+    "gemm_fp32_mlir_vector_avx2": {
+      "type": "IR-GEN",
+      "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
+      "environment": {},
+      "flags": [ "-n", "100",  "-run-args='--vector-to-kernels --registerBlocking=4,16 '" ],
+      "extensions": ["(avx2)"]
+    },
+    "gemm_fp32_mlir_vector_sve": {
+      "type": "IR-GEN",
+      "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
+      "environment": {},
+      "flags": [ "-n", "100",  "-run-args='--vector-to-kernels --registerBlocking=4,32 '" ],
+      "extensions": ["(asimd)"]
+    },
     "gemm_bf16_dp2_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ],
@@ -64,13 +78,27 @@
       "flags": [ "-n", "100" ],
       "extensions": []
     },
-    "mlp_fp32_mlir_vector": {
+    "mlp_fp32_mlir_vector_avx512": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": {},
-      "flags": [ "-n", "100",  "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100",  "-run-args='--def-parallel --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": []
     },
+    "mlp_fp32_mlir_vector_avx2": {
+      "type": "IR-GEN",
+      "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
+      "environment": {},
+      "flags": [ "-n", "100",  "-run-args='--def-parallel --vector-to-kernels --registerBlocking=4,16 '" ],
+      "extensions": ["(avx2)" ]
+    },
+    "mlp_fp32_mlir_vector_sve": {
+      "type": "IR-GEN",
+      "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
+      "environment": {},
+      "flags": [ "-n", "100",  "-run-args='--def-parallel --vector-to-kernels --registerBlocking=4,32 '" ],
+      "extensions": ["(asimd)"]
+    },
     "mlp_bf16_dp2_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ],
@@ -99,7 +127,7 @@
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
       "environment": {},
-      "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ "(avx2|asimd)" ]
     },
     "fp32_3x1024_args_mlir": {
@@ -113,7 +141,7 @@
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
       "environment": {},
-      "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ "(avx2|asimd)" ]
     },
     "bf16_3x1024_const_mlir": {
@@ -144,7 +172,7 @@
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
       "environment": {},
-      "flags": [ "-n", "100", "-run-args='--def-parallel  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ "(avx2|asimd)" ]
     },
     "fp32_3x1024_args_mlir": {
@@ -158,7 +186,7 @@
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
       "environment": {},
-      "flags": [ "-n", "100", "-run-args=' --def-parallel  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args=' --def-parallel  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ "(avx2|asimd)" ]
     },
     "bf16_3x1024_const_mlir": {

diff --git a/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json b/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json
@@ -129,28 +129,28 @@
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     },
     "fp32_3x1024_omp_4_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     },
     "fp32_3x1024_omp_8_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     },
     "fp32_3x1024_omp_16_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     }
   }},
@@ -160,28 +160,28 @@
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     },
     "fp32_3x1024_omp_4_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     },
     "fp32_3x1024_omp_8_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     },
     "fp32_3x1024_omp_16_mlir": {
       "type": "IR-GEN",
       "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
       "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,32 '" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,16 '" ],
       "extensions": [ "(avx2)" ]
     }
   }},

diff --git a/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json b/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json
@@ -5,28 +5,28 @@
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     },
     "fp32_3x1024_omp_4_mlir": {
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     },
     "fp32_3x1024_omp_8_mlir": {
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     },
     "fp32_3x1024_omp_16_mlir": {
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     }
   }},
@@ -36,28 +36,28 @@
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     },
     "fp32_3x1024_omp_4_mlir": {
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     },
     "fp32_3x1024_omp_8_mlir": {
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     },
     "fp32_3x1024_omp_16_mlir": {
       "type": "MLIR",
       "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
       "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
-      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8  --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
+      "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8  --vector-to-kernels --registerBlocking=8,32 '" ],
       "extensions": [ ]
     }
   }}

diff --git a/include/TPP/PassBundles.td b/include/TPP/PassBundles.td
@@ -47,10 +47,8 @@ def DefaultTppPasses : Pass<"default-tpp-passes", "ModuleOp"> {
     Option<"lowerPackUnpackWithoutTranspose", "lower-pack-unpack-without-transpose",
            "bool", /*default=*/"false",
            "Lower non-constant packs and unpacks reverting any dim permutations.">,
-    ListOption<"lhsTile", "lhsTile",
-           "unsigned", "Lhs tile size for brgemm operation.">,
-    ListOption<"rhsTile", "rhsTile",
-           "unsigned", "Rhs tile size for brgemm operation.">,
+    ListOption<"registerBlocking", "registerBlocking",
+           "unsigned", "Register blocking tile sizes for brgemm operation.">,
 
   ];
 }

diff --git a/include/TPP/Passes.td b/include/TPP/Passes.td
@@ -92,14 +92,13 @@ def VectorContractToFMA : Pass<
 def BrgemmLinalgTiling : Pass<"tile-brgemm-linalg"> {
   let summary = "Tile brgemm matmul and reduction dimension.";
   let description = [{
-    Tiles the innermost dimensions of the batch reduce matmul operation. Additionally, it swaps the reduction and k dimension loop. The final loop structure is as follows: M-loop->N-loop->reduction-loop->K-loop. For example: --tile-brgemm-linalg="lhsTile=8,8 rhsTile=8,16".
+    Tiles the innermost dimensions of the batch reduce matmul operation to support perfect register allocation. Additionally, it swaps the reduction and K dimension loops. The final loop structure is as follows: M-loop->N-loop->reduction-loop->K-loop. For example: --tile-brgemm-linalg="registerBlocking=<mTileSize>,<nTileSize>".
   }];
   let dependentDialects = ["linalg::LinalgDialect",
                            "memref::MemRefDialect",
                            "arith::ArithDialect"];
   let options = [
-         ListOption<"mTileShape", "lhsTile", "unsigned", "Input for the tile shape of m x k dim. Tile size should not be greater than the dimension size. Example: lhsTile=8,8">,
-         ListOption<"nTileShape", "rhsTile", "unsigned", "Input for the tile shape of k x n dim. Tile size should not be greater than the dimension size. Example: rhsTile=8,16">,
+         ListOption<"registerTileShape", "registerBlocking", "unsigned", "Input for the register blocking tile shapes for a brgemm operation">,
   ];
 }
 

diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp
@@ -66,18 +66,13 @@ llvm::cl::opt<bool> lowerPackUnpackWithoutTranspose(
     llvm::cl::desc("Lower packs and unpacks reverting any dim permutations"),
     llvm::cl::init(false));
 
-// Lhs tile sizes for linalg-to-vector.
-llvm::cl::list<unsigned>
-    lhsTile("lhsTile", llvm::cl::desc("Lhs tile size for brgemm operation"),
-            llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 8}),
-            llvm::cl::CommaSeparated);
 
-// Rhs tile sizes for linalg-to-vector
 llvm::cl::list<unsigned>
-    rhsTile("rhsTile", llvm::cl::desc("Rhs tile size for brgemm operation"),
-            llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 16}),
+    registerBlocking("registerBlocking", llvm::cl::desc("Register blocking tile sizes for brgemm operation"),
+            llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 32}),
             llvm::cl::CommaSeparated);
 
+
 llvm::cl::opt<bool> vectorToXSMM("vector-to-XSMM",
                                  llvm::cl::desc("Lower vector to XSMM"),
                                  llvm::cl::init(false));
@@ -160,10 +155,8 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
       tppDefaultOptions.linalgToVector = linalgToVector;
       tppDefaultOptions.vectorToXSMM = vectorToXSMM;
       tppDefaultOptions.lowerPackUnpackWithoutTranspose = lowerPackUnpackWithoutTranspose;
-      tppDefaultOptions.lhsTile =
-          SmallVector<unsigned>{lhsTile.begin(), lhsTile.end()};
-      tppDefaultOptions.rhsTile =
-          SmallVector<unsigned>{rhsTile.begin(), rhsTile.end()};
+      tppDefaultOptions.registerBlocking =
+          SmallVector<unsigned>{registerBlocking.begin(), registerBlocking.end()};
       tppDefaultOptions.vectorToKernel = vectorToKernel;
 
       pm.addPass(createDefaultTppPasses(tppDefaultOptions));

diff --git a/lib/TPP/DefaultTppPasses.cpp b/lib/TPP/DefaultTppPasses.cpp
@@ -139,8 +139,7 @@ struct DefaultTppPasses
       if (linalgToVector || forceLinalgToVector) {
         // Vectorizes the remaining Linalg operations
         pm.addNestedPass<func::FuncOp>(createBrgemmLinalgTiling(
-            BrgemmLinalgTilingOptions{SmallVector<unsigned>{*lhsTile},
-                                      SmallVector<unsigned>{*rhsTile}}));
+            BrgemmLinalgTilingOptions{SmallVector<unsigned>{*registerBlocking}}));
         pm.addNestedPass<func::FuncOp>(createLoopInvariantCodeMotionPass());
         pm.addNestedPass<func::FuncOp>(createVectorizationPass());