Skip to content

Commit

Permalink
design change lhs/rhsTile to registerBlocking
Browse files Browse the repository at this point in the history
  • Loading branch information
Arun Thangamani committed Jan 28, 2025
1 parent bc90eb6 commit 1806177
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 135 deletions.
44 changes: 36 additions & 8 deletions benchmarks/config/base/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,27 @@
"flags": [ "-n", "100" ],
"extensions": []
},
"gemm_fp32_mlir_vector": {
"gemm_fp32_mlir_vector_avx512": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": []
},
"gemm_fp32_mlir_vector_avx2": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": ["(avx2)"]
},
"gemm_fp32_mlir_vector_sve": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --registerBlocking=4,32 '" ],
"extensions": ["(asimd)"]
},
"gemm_bf16_dp2_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ],
Expand All @@ -64,13 +78,27 @@
"flags": [ "-n", "100" ],
"extensions": []
},
"mlp_fp32_mlir_vector": {
"mlp_fp32_mlir_vector_avx512": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": []
},
"mlp_fp32_mlir_vector_avx2": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": ["(avx2)" ]
},
"mlp_fp32_mlir_vector_sve": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --registerBlocking=4,32 '" ],
"extensions": ["(asimd)"]
},
"mlp_bf16_dp2_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ],
Expand Down Expand Up @@ -99,7 +127,7 @@
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_args_mlir": {
Expand All @@ -113,7 +141,7 @@
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ "(avx2|asimd)" ]
},
"bf16_3x1024_const_mlir": {
Expand Down Expand Up @@ -144,7 +172,7 @@
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_args_mlir": {
Expand All @@ -158,7 +186,7 @@
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args=' --def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args=' --def-parallel --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ "(avx2|asimd)" ]
},
"bf16_3x1024_const_mlir": {
Expand Down
16 changes: 8 additions & 8 deletions benchmarks/config/omp/mlir-fp32-vector-to-kernel.json
Original file line number Diff line number Diff line change
Expand Up @@ -129,28 +129,28 @@
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
}
}},
Expand All @@ -160,28 +160,28 @@
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,32 '" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=4,16 '" ],
"extensions": [ "(avx2)" ]
}
}},
Expand Down
16 changes: 8 additions & 8 deletions benchmarks/config/omp/torch-dynamo-vector-to-kernel.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,28 @@
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
}
}},
Expand All @@ -36,28 +36,28 @@
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --registerBlocking=8,32 '" ],
"extensions": [ ]
}
}}
Expand Down
6 changes: 2 additions & 4 deletions include/TPP/PassBundles.td
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,8 @@ def DefaultTppPasses : Pass<"default-tpp-passes", "ModuleOp"> {
Option<"lowerPackUnpackWithoutTranspose", "lower-pack-unpack-without-transpose",
"bool", /*default=*/"false",
"Lower non-constant packs and unpacks reverting any dim permutations.">,
ListOption<"lhsTile", "lhsTile",
"unsigned", "Lhs tile size for brgemm operation.">,
ListOption<"rhsTile", "rhsTile",
"unsigned", "Rhs tile size for brgemm operation.">,
ListOption<"registerBlocking", "registerBlocking",
"unsigned", "Register blocking tile sizes for brgemm operation.">,

];
}
Expand Down
5 changes: 2 additions & 3 deletions include/TPP/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,13 @@ def VectorContractToFMA : Pass<
def BrgemmLinalgTiling : Pass<"tile-brgemm-linalg"> {
let summary = "Tile brgemm matmul and reduction dimension.";
let description = [{
Tiles the innermost dimensions of the batch reduce matmul operation. Additionally, it swaps the reduction and k dimension loop. The final loop structure is as follows: M-loop->N-loop->reduction-loop->K-loop. For example: --tile-brgemm-linalg="lhsTile=8,8 rhsTile=8,16".
Tiles the innermost dimensions of the batch-reduce matmul operation to support perfect register allocation. Additionally, it swaps the reduction and K-dimension loops. The final loop structure is as follows: M-loop->N-loop->reduction-loop->K-loop. For example: --tile-brgemm-linalg="registerBlocking=<mTileSize>,<nTileSize>".
}];
let dependentDialects = ["linalg::LinalgDialect",
"memref::MemRefDialect",
"arith::ArithDialect"];
let options = [
ListOption<"mTileShape", "lhsTile", "unsigned", "Input for the tile shape of m x k dim. Tile size should not be greater than the dimension size. Example: lhsTile=8,8">,
ListOption<"nTileShape", "rhsTile", "unsigned", "Input for the tile shape of k x n dim. Tile size should not be greater than the dimension size. Example: rhsTile=8,16">,
ListOption<"registerTileShape", "registerBlocking", "unsigned", "Input for the register blocking tile shapes for a brgemm operation">,
];
}

Expand Down
17 changes: 5 additions & 12 deletions lib/TPP/DefaultPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,13 @@ llvm::cl::opt<bool> lowerPackUnpackWithoutTranspose(
llvm::cl::desc("Lower packs and unpacks reverting any dim permutations"),
llvm::cl::init(false));

// Lhs tile sizes for linalg-to-vector.
llvm::cl::list<unsigned>
lhsTile("lhsTile", llvm::cl::desc("Lhs tile size for brgemm operation"),
llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 8}),
llvm::cl::CommaSeparated);

// Rhs tile sizes for linalg-to-vector
llvm::cl::list<unsigned>
rhsTile("rhsTile", llvm::cl::desc("Rhs tile size for brgemm operation"),
llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 16}),
registerBlocking("registerBlocking", llvm::cl::desc("Register blocking tile sizes for brgemm operation"),
llvm::cl::list_init<unsigned>(SmallVector<unsigned>{8, 32}),
llvm::cl::CommaSeparated);


llvm::cl::opt<bool> vectorToXSMM("vector-to-XSMM",
llvm::cl::desc("Lower vector to XSMM"),
llvm::cl::init(false));
Expand Down Expand Up @@ -160,10 +155,8 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
tppDefaultOptions.linalgToVector = linalgToVector;
tppDefaultOptions.vectorToXSMM = vectorToXSMM;
tppDefaultOptions.lowerPackUnpackWithoutTranspose = lowerPackUnpackWithoutTranspose;
tppDefaultOptions.lhsTile =
SmallVector<unsigned>{lhsTile.begin(), lhsTile.end()};
tppDefaultOptions.rhsTile =
SmallVector<unsigned>{rhsTile.begin(), rhsTile.end()};
tppDefaultOptions.registerBlocking =
SmallVector<unsigned>{registerBlocking.begin(), registerBlocking.end()};
tppDefaultOptions.vectorToKernel = vectorToKernel;

pm.addPass(createDefaultTppPasses(tppDefaultOptions));
Expand Down
3 changes: 1 addition & 2 deletions lib/TPP/DefaultTppPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,7 @@ struct DefaultTppPasses
if (linalgToVector || forceLinalgToVector) {
// Vectorizes the remaining Linalg operations
pm.addNestedPass<func::FuncOp>(createBrgemmLinalgTiling(
BrgemmLinalgTilingOptions{SmallVector<unsigned>{*lhsTile},
SmallVector<unsigned>{*rhsTile}}));
BrgemmLinalgTilingOptions{SmallVector<unsigned>{*registerBlocking}}));
pm.addNestedPass<func::FuncOp>(createLoopInvariantCodeMotionPass());
pm.addNestedPass<func::FuncOp>(createVectorizationPass());

Expand Down
Loading

0 comments on commit 1806177

Please sign in to comment.