From 9514fc24a1f655c2d6d847aa5533005fb71231a5 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk
Date: Fri, 10 Jan 2025 16:40:49 +0100
Subject: [PATCH] WIP update tests

---
 test/BF16/Integration/matmul-pbf16.mlir       |  50 -------
 test/BF16/Integration/mlir-gen-bf16.mlir      |  28 ++--
 .../BF16/Integration/mlp-all-bf16-tpprun.mlir | 137 ------------------
 .../BF16/Integration/tpp-run-splat-shape.mlir |   2 +-
 test/BF16/Integration/vnni-xsmm-vs-loops.mlir |  29 +---
 5 files changed, 23 insertions(+), 223 deletions(-)
 delete mode 100644 test/BF16/Integration/matmul-pbf16.mlir
 delete mode 100644 test/BF16/Integration/mlp-all-bf16-tpprun.mlir

diff --git a/test/BF16/Integration/matmul-pbf16.mlir b/test/BF16/Integration/matmul-pbf16.mlir
deleted file mode 100644
index f2434271d..000000000
--- a/test/BF16/Integration/matmul-pbf16.mlir
+++ /dev/null
@@ -1,50 +0,0 @@
-// RUN: tpp-run %s -print \
-// RUN: -e entry -entry-point-result=void | \
-// RUN: FileCheck %s
-
-#map = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2, d0)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
-
-func.func @matmultpp(%A: memref<4x8xbf16>,
-                     %B: memref<4x4x2xbf16>, %C: memref<4x4xbf16>) {
-  %expanded = memref.expand_shape %A [[0], [1, 2]] output_shape [4, 4, 2]
-    : memref<4x8xbf16> into memref<4x4x2xbf16>
-  linalg.generic {
-    indexing_maps = [#map, #map1, #map2],
-    iterator_types = ["reduction", "parallel", "parallel", "reduction"]}
-    ins(%expanded, %B : memref<4x4x2xbf16>, memref<4x4x2xbf16>)
-    outs(%C : memref<4x4xbf16>) {
-      ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
-        %1 = arith.mulf %in, %in_2 : bf16
-        %2 = arith.addf %out, %1 : bf16
-        linalg.yield %2 : bf16
-  }
-  return
-}
-
-func.func @entry() {
-  %c0 = arith.constant 0 : index
-  %f0 = arith.constant 1.0 : bf16
-  %da = memref.alloc() :memref<4x8xbf16>
-  linalg.fill ins(%f0 : bf16) outs(%da : memref<4x8xbf16>)
-  // Call kernel.
-  %0 = memref.alloc() : memref<4x4x2xbf16>
-  linalg.fill ins(%f0:bf16) outs (%0: memref<4x4x2xbf16>)
-  %D = memref.alloc() : memref<4x4xbf16>
-  %zero = arith.constant 0.0 : bf16
-  linalg.fill ins(%zero : bf16) outs(%D:memref<4x4xbf16>)
-  call @matmultpp(%da, %0, %D)
-    : (memref<4x8xbf16>, memref<4x4x2xbf16>, memref<4x4xbf16>)->()
-
-  //
-  // CHECK:( ( 8, 8, 8, 8 ), ( 8, 8, 8, 8 ), ( 8, 8, 8, 8 ), ( 8, 8, 8, 8 ) )
-  //
-  %d1 = arith.constant -1.0 : bf16
-
-  %v0 = vector.transfer_read %D[%c0, %c0], %d1 : memref<4x4xbf16>, vector<4x4xbf16>
-  %f1 = arith.extf %v0:vector<4x4xbf16> to vector<4x4xf32>
-  vector.print %f1 : vector<4x4xf32>
-
-  return
-}
diff --git a/test/BF16/Integration/mlir-gen-bf16.mlir b/test/BF16/Integration/mlir-gen-bf16.mlir
index a0db89a6b..97035a7d1 100644
--- a/test/BF16/Integration/mlir-gen-bf16.mlir
+++ b/test/BF16/Integration/mlir-gen-bf16.mlir
@@ -1,28 +1,28 @@
 // MLP without softmax (can't print packed version for now)
-// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10,10 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
-// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10,10 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
+// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16,16 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
+// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16,16 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
 
 // Matmul only
-// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
-// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
+// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
+// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 --float-type=bf16 | tpp-run -e entry -entry-point-result=void
 
 // Kernel - matmul
-// RUN: mlir-gen --kernel=args --seed=123 --float-type=bf16 --batch=10 --layers=10,10 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-MATMUL-BF16
-// RUN: mlir-gen --output=named --kernel=args --seed=123 --float-type=bf16 --batch=10 --layers=10,10 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-MATMUL-BF16
+// RUN: mlir-gen --kernel=args --seed=123 --float-type=bf16 --batch=16 --layers=16,16 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-MATMUL-BF16
+// RUN: mlir-gen --output=named --kernel=args --seed=123 --float-type=bf16 --batch=16 --layers=16,16 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-MATMUL-BF16
 
 // Kernel - fc
-// RUN: mlir-gen --kernel=args --bias --relu --seed=123 --float-type=bf16 --batch=10 --layers=10,10 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-FC-BF16
-// RUN: mlir-gen --output=named --kernel=args --bias --relu --seed=123 --float-type=bf16 --batch=10 --layers=10,10 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-FC-BF16
+// RUN: mlir-gen --kernel=args --bias --relu --seed=123 --float-type=bf16 --batch=16 --layers=16,16 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-FC-BF16
+// RUN: mlir-gen --output=named --kernel=args --bias --relu --seed=123 --float-type=bf16 --batch=16 --layers=16,16 | tpp-run -e entry -entry-point-result=void -print | FileCheck %s --check-prefix=GEN-FC-BF16
 
 // BF16/VNNI execution
-// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10 --tiles=2,2,2 --float-type=bf16 | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
-// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10 --tiles=2,2,2 --float-type=bf16 | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
-// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10 --tiles=2,2,2 --float-type=bf16 | tpp-opt --pack-vnni | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
-// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=10 --layers=10,10 --tiles=2,2,2 --float-type=bf16 | tpp-opt --pack-vnni | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
+// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 --tiles=8,8,8 --float-type=bf16 | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
+// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 --tiles=8,8,8 --float-type=bf16 | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
+// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 --tiles=8,8,8 --float-type=bf16 | tpp-opt --pack-vnni | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
+// RUN: mlir-gen --output=named --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 --tiles=8,8,8 --float-type=bf16 | tpp-opt --pack-vnni | tpp-run -e entry -entry-point-result=void -n 10 | FileCheck %s --check-prefix=PERF
 
-// GEN-MATMUL-BF16: ( 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 )
+// GEN-MATMUL-BF16: ( 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17 )
 
-// GEN-FC-BF16: ( 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 )
+// GEN-FC-BF16: ( 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18 )
 
 // PERF: {{[0-9]+}}{{.?}}{{[0-9e-]+}}
diff --git a/test/BF16/Integration/mlp-all-bf16-tpprun.mlir b/test/BF16/Integration/mlp-all-bf16-tpprun.mlir
deleted file mode 100644
index 5f7968719..000000000
--- a/test/BF16/Integration/mlp-all-bf16-tpprun.mlir
+++ /dev/null
@@ -1,137 +0,0 @@
-// RUN: tpp-run %s \
-// RUN: -e entry -entry-point-result=void
-
-memref.global "private" constant @arg1 : memref<128x512x2xbf16> = dense<1.00e+00>
-memref.global "private" constant @arg3 : memref<256x1024x2xbf16> = dense<1.00e+00>
-memref.global "private" constant @arg5 : memref<512x2048x2xbf16> = dense<1.00e+00>
-memref.global "private" constant @arg7 : memref<1024x1000x2xbf16> = dense<1.00e+00>
-
-#map = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2, d0)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
-#map3 = affine_map<(d0, d1) -> (d0, d1)>
-#map4 = affine_map<(d0, d1) -> (d1)>
-
-func.func @entry(%arg0: memref<128x256xbf16>, %arg2: memref<512xbf16>, %arg4: memref<1024xbf16>,
-                 %arg6: memref<2048xbf16>, %arg8: memref<1000xbf16>, %arg9: memref<128x512xbf16>,
-                 %arg10: memref<128x1024xbf16>, %arg11: memref<128x2048xbf16>, %arg12: memref<128x1000xbf16>) {
-  %c0 = arith.constant 0.0 : bf16
-  linalg.generic {
-    indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg2: memref<512xbf16>) outs(%arg9: memref<128x512xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        linalg.yield %in : bf16
-  }
-
-  %e0 = memref.expand_shape %arg0 [[0], [1, 2]] output_shape [128, 128, 2]
-    : memref<128x256xbf16> into memref<128x128x2xbf16>
-  %relayout_arg0 = memref.get_global @arg1:memref<128x512x2xbf16>
-  linalg.generic {
-    indexing_maps = [#map, #map1, #map2],
-    iterator_types = ["reduction", "parallel", "parallel", "reduction"]}
-    ins(%e0, %relayout_arg0 : memref<128x128x2xbf16>, memref<128x512x2xbf16>)
-    outs(%arg9 : memref<128x512xbf16>) {
-      ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
-        %1 = arith.mulf %in, %in_2 : bf16
-        %2 = arith.addf %out, %1 : bf16
-        linalg.yield %2 : bf16
-  }
-  linalg.generic {
-    indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg9 : memref<128x512xbf16>) outs(%arg9 : memref<128x512xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        %2 = arith.maximumf %in, %c0 : bf16
-        linalg.yield %2 : bf16
-  }
-
-  linalg.generic {
-    indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg4: memref<1024xbf16>) outs(%arg10: memref<128x1024xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        linalg.yield %in : bf16
-  }
-
-  %e1 = memref.expand_shape %arg9 [[0], [1, 2]] output_shape [128, 256, 2]
-    : memref<128x512xbf16> into memref<128x256x2xbf16>
-  %relayout_arg12 = memref.get_global @arg3:memref<256x1024x2xbf16>
-  linalg.generic {
-    indexing_maps = [#map, #map1, #map2],
-    iterator_types = ["reduction", "parallel", "parallel", "reduction"]}
-    ins(%e1, %relayout_arg12 : memref<128x256x2xbf16>, memref<256x1024x2xbf16>)
-    outs(%arg10 : memref<128x1024xbf16>) {
-      ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
-        %1 = arith.mulf %in, %in_2 : bf16
-        %2 = arith.addf %out, %1 : bf16
-        linalg.yield %2 : bf16
-  }
-  linalg.generic {
-    indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg10 : memref<128x1024xbf16>) outs(%arg10 : memref<128x1024xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        %2 = arith.maximumf %in, %c0 : bf16
-        linalg.yield %2 : bf16
-  }
-
-  linalg.generic {
-    indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg6: memref<2048xbf16>) outs(%arg11: memref<128x2048xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        linalg.yield %in : bf16
-  }
-
-  %relayout_arg11 = memref.get_global @arg5:memref<512x2048x2xbf16>
-  %e2 = memref.expand_shape %arg10 [[0], [1, 2]] output_shape [128, 512, 2]
-    : memref<128x1024xbf16> into memref<128x512x2xbf16>
-  linalg.generic {
-    indexing_maps = [#map, #map1, #map2],
-    iterator_types = ["reduction", "parallel", "parallel", "reduction"]}
-    ins(%e2, %relayout_arg11 : memref<128x512x2xbf16>, memref<512x2048x2xbf16>)
-    outs(%arg11 : memref<128x2048xbf16>) {
-      ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
-        %1 = arith.mulf %in, %in_2 : bf16
-        %2 = arith.addf %out, %1 : bf16
-        linalg.yield %2 : bf16
-  }
-  linalg.generic {
-    indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg11 : memref<128x2048xbf16>) outs(%arg11 : memref<128x2048xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        %2 = arith.maximumf %in, %c0 : bf16
-        linalg.yield %2 : bf16
-  }
-
-  linalg.generic {
-    indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg8: memref<1000xbf16>) outs(%arg12: memref<128x1000xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        linalg.yield %in : bf16
-  }
-
-  %relayout_arg10 = memref.get_global @arg7:memref<1024x1000x2xbf16>
-  %e3 = memref.expand_shape %arg11 [[0], [1, 2]] output_shape [128, 1024, 2]
-    : memref<128x2048xbf16> into memref<128x1024x2xbf16>
-  linalg.generic {
-    indexing_maps = [#map, #map1, #map2],
-    iterator_types = ["reduction", "parallel", "parallel", "reduction"]}
-    ins(%e3, %relayout_arg10 : memref<128x1024x2xbf16>, memref<1024x1000x2xbf16>)
-    outs(%arg12 : memref<128x1000xbf16>) {
-      ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
-        %1 = arith.mulf %in, %in_2 : bf16
-        %2 = arith.addf %out, %1 : bf16
-        linalg.yield %2 : bf16
-  }
-  linalg.generic {
-    indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]}
-    ins(%arg12 : memref<128x1000xbf16>) outs(%arg12 : memref<128x1000xbf16>) {
-      ^bb0(%in: bf16, %out: bf16):
-        %2 = arith.maximumf %in, %c0 : bf16
-        linalg.yield %2 : bf16
-  }
-
-  %threshold = arith.constant 1.0 : bf16
-  %c4 = arith.constant 2.74878e+11: bf16
-  %interim4 = memref.alloc(): memref<128x1000xbf16>
-  linalg.fill ins(%c4:bf16) outs(%interim4: memref<128x1000xbf16>)
-  check.expect_almost_eq(%interim4, %arg12, %threshold): memref<128x1000xbf16>, memref<128x1000xbf16>, bf16
-  return
-}
diff --git a/test/BF16/Integration/tpp-run-splat-shape.mlir b/test/BF16/Integration/tpp-run-splat-shape.mlir
index 624aeb754..935586599 100644
--- a/test/BF16/Integration/tpp-run-splat-shape.mlir
+++ b/test/BF16/Integration/tpp-run-splat-shape.mlir
@@ -41,7 +41,7 @@ func.func @entry(%arg0: tensor<4x8x8x8xbf16>, %output: tensor<4x8x8x8xbf16>) ->
 // due to compile time packing.
 // CHECK-NOT: memref.global "private" constant @__constant_{{.*}}: memref<8x8xbf16>
 // CHECK-DAG: memref.global "private" constant @__constant_{{.*}}: memref<4x8x8x8xbf16>
-// CHECK-DAG: memref.global "private" constant @__constant_{{.*}}: memref<8x8x4x8x2xbf16>
+// CHECK-DAG: memref.global "private" constant @__constant_{{.*}}: memref<8x8x4x8x{{[2|4|8]}}xbf16>
 // CHECK: xsmm_brgemm_invoke
 // CHECK: xsmm_binary_invoke
 // CHECK: xsmm_unary_invoke
diff --git a/test/BF16/Integration/vnni-xsmm-vs-loops.mlir b/test/BF16/Integration/vnni-xsmm-vs-loops.mlir
index 2a8419395..0f7eb99d1 100644
--- a/test/BF16/Integration/vnni-xsmm-vs-loops.mlir
+++ b/test/BF16/Integration/vnni-xsmm-vs-loops.mlir
@@ -1,26 +1,13 @@
-// RUN: tpp-run %s -print -seed 123 \
+// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 \
+// RUN: --tiles=16,16,16 --float-type=bf16 | \
+// RUN: tpp-opt --pack-vnni | \
+// RUN: tpp-run -print -seed 123 \
 // RUN: -e entry -entry-point-result=void > %t.xsmm
-// RUN: tpp-run %s -print -seed 123 -linalg-to-loops \
+// RUN: mlir-gen --kernel=const --bias --relu --seed=123 --batch=16 --layers=16,16 \
+// RUN: --tiles=16,16,16 --float-type=bf16 | \
+// RUN: tpp-opt --pack-vnni | \
+// RUN: tpp-run -print -seed 123 -linalg-to-loops \
 // RUN: -e entry -entry-point-result=void > %t.loops
 // RUN: fpcmp -r 0.01 %t.xsmm %t.loops
-
-#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6, d3)>
-#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6, d5, d3)>
-#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)>
-
-func.func @entry(%arg0: tensor<2x2x7x4x2xbf16>, %arg1: tensor<2x2x4x5x2xbf16>,
-                 %arg2: tensor<2x2x7x5xbf16>) -> tensor<2x2x7x5xbf16> {
-  %1 = linalg.generic {
-    indexing_maps = [#map, #map1, #map2],
-    iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]}
-    ins(%arg0, %arg1 : tensor<2x2x7x4x2xbf16>, tensor<2x2x4x5x2xbf16>)
-    outs(%arg2 : tensor<2x2x7x5xbf16>) {
-      ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
-        %2 = arith.mulf %in, %in_0 : bf16
-        %3 = arith.addf %out, %2 : bf16
-        linalg.yield %3 : bf16
-  } -> tensor<2x2x7x5xbf16>
-  return %1 : tensor<2x2x7x5xbf16>
-}