
Commit 114add9

[XPU][OptRed] Define triton_intel_gpu.simd_reduce and use in optimized transposed reduction
Define a SIMD transpose-reduce operation that performs a SIMD reduction while transposing the implicit SIMD matrix. See the operation definition's description for further context. Using this operation in the transposed-reduction pass allows us to perform the optimization without using SLM.

Signed-off-by: victor-eds <[email protected]>
Co-authored-by: chengjunlu <[email protected]>
Signed-off-by: Victor Perez <[email protected]>
1 parent 6588f0d commit 114add9
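For reference, a minimal usage sketch of the new op, taken from the test files added in this commit (the `%arg0` name and the 16x16 layouts are those of the basic test case):

```mlir
#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>

// Reduce the 16x16 tensor along axis 0 within the sub-group; each of the
// 16 lanes ends up holding one element of the 16-element result.
%0 = triton_intel_gpu.simd_reduce add %arg0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
```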

File tree

10 files changed: +1486, −161 lines changed

+24
@@ -0,0 +1,24 @@
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s
+
+// Basic 16x16 SIMD reduction.
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: llvm.func spir_kernelcc @test_single(
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct
+  // CHECK: %[[VAL_17:.*]] = llvm.mlir.poison : vector<16xf32>
+  // COM: Check we insert all tensor elements in a vector:
+  // CHECK-COUNT-16: llvm.insertelement
+  // CHECK: %[[VAL_50:.*]] = llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "{\0A.decl temp_result v_type=G type=f num_elts=128 align=wordx32\0Aadd (M1_NM, 16) temp_result(0, 0)<1> $1(0, 0)<16;8,1> $1(0, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(1, 0)<1> $1(2, 0)<16;8,1> $1(2, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(2, 0)<1> $1(4, 0)<16;8,1> $1(4, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(3, 0)<1> $1(6, 0)<16;8,1> $1(6, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(4, 0)<1> $1(8, 0)<16;8,1> $1(8, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(5, 0)<1> $1(10, 0)<16;8,1> $1(10, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(6, 0)<1> $1(12, 0)<16;8,1> $1(12, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(7, 0)<1> $1(14, 0)<16;8,1> $1(14, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(0, 0)<1> temp_result(0, 0)<8;4,1> temp_result(0, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(1, 0)<1> temp_result(2, 0)<8;4,1> temp_result(2, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(2, 0)<1> temp_result(4, 0)<8;4,1> temp_result(4, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(3, 0)<1> temp_result(6, 0)<8;4,1> temp_result(6, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(0, 0)<1> temp_result(0, 0)<4;2,1> temp_result(0, 2)<4;2,1>\0Aadd (M1_NM, 16) temp_result(1, 0)<1> temp_result(2, 0)<4;2,1> temp_result(2, 2)<4;2,1>\0Aadd (M1_NM, 16) $0(0, 0)<1> temp_result(0, 0)<2;1,0> temp_result(0, 1)<2;1,0>\0A}", "=rw,rw" %{{.*}} : (vector<16xf32>) -> f32
+  // COM: Check we obtain a single result, i.e., the SIMD reduction minimizes register usage.
+  // CHECK: %[[VAL_51:.*]] = llvm.mlir.undef : !llvm.struct<(f32)>
+  // CHECK: %[[VAL_52:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_51]][0] : !llvm.struct<(f32)>
+  // CHECK: llvm.return %[[VAL_52]] : !llvm.struct<(f32)>
+  // CHECK: }
+  tt.func @test_single(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16xf32, #blocked1> {
+    %0 = triton_intel_gpu.simd_reduce add %arg0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
+    tt.return %0 : tensor<16xf32, #blocked1>
+  }
+}

test/TritonIntelGPU/optimize-reduction-simd.mlir: +289 (large diff not rendered by default)

test/TritonIntelGPU/optimize-reduction.mlir: +153 −150 (large diff not rendered by default)

test/TritonIntelGPU/tritonintelgpu.mlir: +14
@@ -58,3 +58,17 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
     tt.return %res : tensor<16x16xf16>
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  tt.func @triton_intel_gpu.simd_reduce(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16xf32, #blocked1> {
+    // CHECK-LABEL: @triton_intel_gpu.simd_reduce
+    // CHECK: triton_intel_gpu.simd_reduce add %{{.*}} axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
+    %0 = triton_intel_gpu.simd_reduce add %arg0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
+    tt.return %0 : tensor<16xf32, #blocked1>
+  }
+}

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td: +63
@@ -12,6 +12,7 @@
 include "triton/Dialect/Triton/IR/TritonTypes.td"
 include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td"
+include "intel/include/Dialect/TritonGEN/IR/TritonGENAttrDefs.td"
 include "intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td"
 include "intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -202,4 +203,66 @@ def TTIG_SubGroupTransposeOp
   let hasVerifier = 1;
 }
 
+def TTIG_SIMDReduceOp : TTIG_Op<"simd_reduce", [Pure, SameOperandsAndResultElementType]> {
+  let summary = "SIMD reduction.";
+  let description = [{
+    The `triton_intel_gpu.simd_reduce` operation performs a SIMD reduction.
+    Contrary to `tt.reduce`, when performing a warp reduction, the result is
+    non-uniform.
+
+    The reduction axis must be chosen so that only a warp reduction is
+    performed, i.e., `sizePerThread[axis]`, `warpsPerCTA[axis]` and
+    `CTAsPerCGA[axis]` must be 1; and `shape[axis]` and `threadsPerWarp[axis]`
+    must be equal to the sub-group size.
+
+    The output type must be compatible with the performed reduction; however,
+    ensuring this is up to the user. As a rule of thumb, the output tensor must
+    have sub-group size times fewer elements than the input one. Users should
+    bear in mind that a tensor like:
+
+    ```
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    ```
+
+    would be reduced to:
+
+    ```
+    t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
+    ```
+
+    Example:
+    ```mlir
+    #blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+    #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
+    triton_intel_gpu.simd_reduce add %0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
+
+    // 3D reduction:
+    #blocked = #ttg.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [1, 1, 2], order = [0, 1, 2]}>
+    #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 1], warpsPerCTA = [1, 2], order = [0, 1]}>
+    triton_intel_gpu.simd_reduce add %0 axis = 0 : tensor<16x16x2xf32, #blocked> -> tensor<16x2xf32, #blocked1>
+    ```
+  }];
+  let arguments = (ins TT_Tensor:$src,
+                       TritonGEN_ReduceKindAttr:$op,
+                       I32Attr:$axis);
+  let results = (outs TT_Tensor:$res);
+  let assemblyFormat = [{
+    $op $src `axis` `=` $axis attr-dict `:` type($src) `->` type($res)
+  }];
+}
+
#endif
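For contrast with the `tt.reduce` behavior mentioned in the op description (a uniform result in a sliced layout rather than one element per lane), a hypothetical standard warp reduction over the same 16x16 input might look as follows. This is a sketch for illustration only and is not part of the commit; the value names are assumed.

```mlir
#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>

// tt.reduce carries its combiner as a region and yields a tensor in a sliced
// layout; after the warp reduction, every lane of the sub-group sees the same
// reduced values, unlike triton_intel_gpu.simd_reduce.
%1 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
^bb0(%a: f32, %b: f32):
  %s = arith.addf %a, %b : f32
  tt.reduce.return %s : f32
}) : (tensor<16x16xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
```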
