[muti_backend] #294

Status: Open. Wants to merge 32 commits into base: master.

Changes from 27 commits.

Commits (32):
fd24be2  new feature, muti_backend (Galaxy1458, Nov 16, 2024)
c8c1bcd  update auto_tune_module (Galaxy1458, Nov 16, 2024)
8bb14e3  update auto_tune_module (Galaxy1458, Nov 16, 2024)
cca8de5  update auto_tune_module (Galaxy1458, Nov 16, 2024)
1b8cc65  update __init__ (Galaxy1458, Nov 18, 2024)
d1596dd  rebase (Galaxy1458, Nov 18, 2024)
e6bbce4  rebase (Galaxy1458, Nov 18, 2024)
81ee7a8  fix bug (Galaxy1458, Nov 18, 2024)
f6b3e94  modifiy auto_tune_config (Galaxy1458, Nov 18, 2024)
8b77745  fix bug (Galaxy1458, Nov 20, 2024)
9fbb1e3  fix bug (Galaxy1458, Nov 20, 2024)
d55a303  fix conflict (Galaxy1458, Nov 20, 2024)
0adbfe5  update (Galaxy1458, Nov 20, 2024)
389f726  update (Galaxy1458, Nov 20, 2024)
c2b03c1  update scatter&gather (Galaxy1458, Nov 20, 2024)
51b22cf  fix auto_tune (Galaxy1458, Nov 20, 2024)
8e0617e  add gen_torch_device_fn (Galaxy1458, Nov 20, 2024)
d359b6a  fix codestyle (Galaxy1458, Nov 20, 2024)
df73601  fix codestyle (Galaxy1458, Nov 20, 2024)
3f64fff  fix comflict (Galaxy1458, Nov 21, 2024)
1b98131  Modify code based on comments (Galaxy1458, Nov 28, 2024)
8c83a46  Adjust the code based on the comments (Galaxy1458, Nov 28, 2024)
dbb90d7  Modify gen_impl with loops instead of recursion (Galaxy1458, Nov 28, 2024)
14a9a19  Update code structure (Galaxy1458, Nov 29, 2024)
bcf2d78  Polish code (Galaxy1458, Nov 29, 2024)
6854b5b  update (Galaxy1458, Nov 29, 2024)
5e46495  Polish code (Galaxy1458, Nov 29, 2024)
98fa16a  Modify code based on comments (Galaxy1458, Dec 3, 2024)
b961015  modify based on comment (Galaxy1458, Dec 3, 2024)
ace1b82  Modify code based on comments (Galaxy1458, Dec 4, 2024)
ce19b23  update (Galaxy1458, Dec 4, 2024)
08c8ab7  resolve conflict (Galaxy1458, Dec 4, 2024)
335 changes: 176 additions & 159 deletions src/flag_gems/__init__.py

Large diffs are not rendered by default.
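The __init__.py diff is collapsed above, but every per-op change below follows the same pattern: a hardcoded triton.Config list (or a local cfggen() helper) is deleted, and the @triton.autotune decorator instead pulls its configs from runtime.get_op_tune_config(op_name), so each backend can supply its own tuning table. A minimal sketch of how such a lookup could work follows; the runtime module itself is not part of this excerpt, so everything here except the get_op_tune_config(op_name) call is an illustrative assumption.

# Hypothetical sketch of a backend-aware tuning-config registry. Only the
# get_op_tune_config(op_name) call appears in this PR; the table layout,
# _CONFIG_TABLES, and BACKEND are illustrative assumptions.
import triton

# The "all" and "index_select" values are copied from the cfggen() helpers
# this PR deletes; a real deployment would hold one table per backend.
_CONFIG_TABLES = {
    "nvidia": {
        "all": [
            triton.Config({"BLOCK_M": m, "BLOCK_N": 1024}, num_warps=4)
            for m in [1, 2, 4, 8]
        ],
        "index_select": [
            triton.Config({"BLOCK_M": m, "BLOCK_N": n}, num_warps=4)
            for m in [1, 2, 4]
            for n in [1024, 2048, 4096]
        ],
    },
}

BACKEND = "nvidia"  # assumed to be selected once, e.g. from an env variable


def get_op_tune_config(op_name):
    """Return the autotune config list for op_name on the active backend."""
    return _CONFIG_TABLES[BACKEND][op_name]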

44 changes: 2 additions & 42 deletions src/flag_gems/ops/addmm.py
@@ -4,54 +4,14 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import libentry
from ..utils import triton_lang_extension as tle


@libentry()
@triton.autotune(
configs=[
triton.Config(
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64},
num_stages=3,
num_warps=8,
),
triton.Config(
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32},
num_stages=4,
num_warps=4,
),
triton.Config(
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
num_stages=4,
num_warps=4,
),
triton.Config(
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
num_stages=4,
num_warps=4,
),
triton.Config(
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32},
num_stages=4,
num_warps=4,
),
triton.Config(
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
num_stages=4,
num_warps=4,
),
triton.Config(
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32},
num_stages=5,
num_warps=2,
),
triton.Config(
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32},
num_stages=5,
num_warps=2,
),
],
configs=runtime.get_op_tune_config("addmm"),
key=["M", "N", "K"],
)
@triton.jit(do_not_specialize=["alpha", "beta"])
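The eight deleted addmm configs are exactly what get_op_tune_config("addmm") must now supply. Expressed as data, they compress to a short table; in the sketch below the tile tuples are copied verbatim from the removed list, while the surrounding names (_ADDMM_TILES, ADDMM_CONFIGS) are illustrative assumptions.

# (BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, num_stages, num_warps) tuples
# copied verbatim from the deleted config list above.
import triton

_ADDMM_TILES = [
    (128, 256, 64, 3, 8),
    (64, 256, 32, 4, 4),
    (128, 128, 32, 4, 4),
    (128, 64, 32, 4, 4),
    (64, 128, 32, 4, 4),
    (128, 32, 32, 4, 4),
    (64, 32, 32, 5, 2),
    (32, 64, 32, 5, 2),
]

ADDMM_CONFIGS = [
    triton.Config(
        {"BLOCK_SIZE_M": m, "BLOCK_SIZE_N": n, "BLOCK_SIZE_K": k},
        num_stages=s,
        num_warps=w,
    )
    for m, n, k, s, w in _ADDMM_TILES
]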
10 changes: 2 additions & 8 deletions src/flag_gems/ops/all.py
@@ -5,19 +5,13 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import dim_compress, libentry
from ..utils import triton_lang_extension as tle


# torch.all tests whether all elements of the input evaluate to True. If the
# input dtype is not bool, it tests whether all elements are non-zero.
# In the Triton kernel it is therefore sufficient to test for non-zero values.
def cfggen():
block_m = [1, 2, 4, 8]
configs = [
triton.Config({"BLOCK_M": m, "BLOCK_N": 1024}, num_warps=4) for m in block_m
]
return configs


@triton.jit
@@ -26,7 +20,7 @@ def reduce_all(a, b):


@libentry()
@triton.autotune(configs=cfggen(), key=["M", "N"])
@triton.autotune(configs=runtime.get_op_tune_config("all"), key=["M", "N"])
@triton.jit
def all_kernel_dim(
inp,
12 changes: 3 additions & 9 deletions src/flag_gems/ops/amax.py
@@ -5,6 +5,7 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import dim_compress, libentry
from ..utils import triton_lang_extension as tle

@@ -18,6 +19,7 @@ def amax_kernel_1(
BLOCK_SIZE: tl.constexpr,
):
pid = tle.program_id(0)

offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
inp_ptrs = inp + offset
mask = offset < M
@@ -38,16 +40,8 @@ def amax_kernel_2(mid, out, mid_size, BLOCK_MID: tl.constexpr):
tl.store(out, amax_val)


def cfggen():
block_m = [1, 2, 4, 8]
configs = [
triton.Config({"BLOCK_M": m, "BLOCK_N": 1024}, num_warps=4) for m in block_m
]
return configs


@libentry()
@triton.autotune(configs=cfggen(), key=["M", "N"])
@triton.autotune(configs=runtime.get_op_tune_config("amax"), key=["M", "N"])
@triton.jit
def amax_kernel(
inp,
10 changes: 2 additions & 8 deletions src/flag_gems/ops/any.py
@@ -5,19 +5,13 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import dim_compress, libentry
from ..utils import triton_lang_extension as tle


# torch.any tests whether any element of the input evaluates to True. If the
# input dtype is not bool, it tests whether any element is non-zero.
# In the Triton kernel it is therefore sufficient to test for non-zero values.
def cfggen():
block_m = [1, 2, 4, 8]
configs = [
triton.Config({"BLOCK_M": m, "BLOCK_N": 1024}, num_warps=4) for m in block_m
]
return configs


@triton.jit
@@ -26,7 +20,7 @@ def reduce_any(a, b):


@libentry()
@triton.autotune(configs=cfggen(), key=["M", "N"])
@triton.autotune(configs=runtime.get_op_tune_config("any"), key=["M", "N"])
@triton.jit
def any_kernel_dim(
inp,
7 changes: 2 additions & 5 deletions src/flag_gems/ops/argmax.py
@@ -5,6 +5,7 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import libentry
from ..utils import triton_lang_extension as tle

@@ -50,11 +51,7 @@ def heur_block_n(args):

@libentry()
@triton.autotune(
configs=[
triton.Config({"BLOCK_M": 8}, num_warps=8),
triton.Config({"BLOCK_M": 16}, num_warps=8),
triton.Config({"BLOCK_M": 32}, num_warps=8),
],
configs=runtime.get_op_tune_config("argmax"),
key=[
"M",
"N",
64 changes: 2 additions & 62 deletions src/flag_gems/ops/bmm.py
@@ -4,6 +4,7 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import libentry
from ..utils import triton_lang_extension as tle

@@ -22,68 +23,7 @@ def heur_divisible_k(args):

@libentry()
@triton.autotune(
configs=[
triton.Config(
{"TILE_M": 32, "TILE_N": 32, "TILE_K": 32, "GROUP_M": 1},
num_warps=4,
num_stages=2,
),
triton.Config(
{"TILE_M": 64, "TILE_N": 32, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=2,
),
triton.Config(
{"TILE_M": 64, "TILE_N": 64, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=2,
),
triton.Config(
{"TILE_M": 128, "TILE_N": 32, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=2,
),
triton.Config(
{"TILE_M": 128, "TILE_N": 64, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=2,
),
triton.Config(
{"TILE_M": 128, "TILE_N": 128, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=2,
),
triton.Config(
{"TILE_M": 32, "TILE_N": 32, "TILE_K": 32, "GROUP_M": 1},
num_warps=4,
num_stages=3,
),
triton.Config(
{"TILE_M": 64, "TILE_N": 32, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=3,
),
triton.Config(
{"TILE_M": 64, "TILE_N": 64, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=3,
),
triton.Config(
{"TILE_M": 128, "TILE_N": 32, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=3,
),
triton.Config(
{"TILE_M": 128, "TILE_N": 64, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=3,
),
triton.Config(
{"TILE_M": 128, "TILE_N": 128, "TILE_K": 32, "GROUP_M": 2},
num_warps=4,
num_stages=3,
),
],
configs=runtime.get_op_tune_config("bmm"),
key=["M", "N", "K"],
)
@triton.heuristics(
37 changes: 7 additions & 30 deletions src/flag_gems/ops/cross_entropy_loss.py
@@ -4,17 +4,14 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import libentry
from ..utils import triton_lang_extension as tle


@libentry()
@triton.autotune(
configs=[
triton.Config({"BLOCK_C": c, "BLOCK_D": d}, num_warps=4)
for c in [256, 512, 1024]
for d in [1, 4, 16]
],
configs=runtime.get_op_tune_config("cross_entropy_loss"),
key=["C", "D"],
)
@triton.jit(do_not_specialize=["ignore_index"])
@@ -75,11 +72,7 @@ def celoss_indices_kernel(

@libentry()
@triton.autotune(
configs=[
triton.Config({"BLOCK_C": c, "BLOCK_D": d}, num_warps=4)
for c in [256, 512, 1024]
for d in [1, 4, 16]
],
configs=runtime.get_op_tune_config("cross_entropy_loss"),
key=["C", "D"],
)
@triton.jit(do_not_specialize=["label_smoothing"])
@@ -138,11 +131,7 @@ def celoss_probability_kernel(

@libentry()
@triton.autotune(
configs=[
triton.Config({"BLOCK_C": c, "BLOCK_D": d}, num_warps=4)
for c in [256, 512, 1024]
for d in [1, 4, 16]
],
configs=runtime.get_op_tune_config("cross_entropy_loss"),
key=["C", "D"],
)
@triton.jit(do_not_specialize=["ignore_index", "label_smoothing"])
@@ -223,11 +212,7 @@ def celoss_indices_smooth_kernel(

@libentry()
@triton.autotune(
configs=[
triton.Config({"BLOCK_C": c, "BLOCK_D": d}, num_warps=4)
for c in [256, 512, 1024]
for d in [1, 4, 16]
],
configs=runtime.get_op_tune_config("cross_entropy_loss"),
key=["C", "D"],
)
@triton.jit(do_not_specialize=["ignore_index", "mean_num"])
@@ -298,11 +283,7 @@ def celoss_indices_bwd(

@libentry()
@triton.autotune(
configs=[
triton.Config({"BLOCK_C": c, "BLOCK_D": d}, num_warps=4)
for c in [256, 512, 1024]
for d in [1, 4, 16]
],
configs=runtime.get_op_tune_config("cross_entropy_loss"),
key=["C", "D"],
)
@triton.jit(do_not_specialize=["label_smoothing", "mean_num"])
@@ -387,11 +368,7 @@ def celoss_probability_bwd(

@libentry()
@triton.autotune(
configs=[
triton.Config({"BLOCK_C": c, "BLOCK_D": d}, num_warps=4)
for c in [256, 512, 1024]
for d in [1, 4, 16]
],
configs=runtime.get_op_tune_config("cross_entropy_loss"),
key=["C", "D"],
)
@triton.jit(do_not_specialize=["ignore_index", "label_smoothing", "mean_num"])
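All six cross-entropy kernels above share the key ["C", "D"] and now draw from the single "cross_entropy_loss" entry. Judging from the deleted inline lists, that entry needs to cover the same 3 x 3 grid; a hedged sketch (grid values copied from the deleted code, the name CELOSS_CONFIGS is an illustrative assumption):

# BLOCK_C/BLOCK_D grid copied from the deleted inline lists above.
import triton

CELOSS_CONFIGS = [
    triton.Config({"BLOCK_C": c, "BLOCK_D": d}, num_warps=4)
    for c in [256, 512, 1024]
    for d in [1, 4, 16]
]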
17 changes: 5 additions & 12 deletions src/flag_gems/ops/gather.py
@@ -16,7 +16,9 @@ def generate_imports(code: IndentedBuffer) -> IndentedBuffer:
code.writeline("import triton.language as tl")
code.newline()
code.writeline("from flag_gems.utils import libentry")
code.writeline("from flag_gems import runtime")
code.writeline("from flag_gems.utils import triton_lang_extension as tle")

code.newline()
code.newline()
return code
@@ -31,24 +33,15 @@ def generate_gather_kernel(
code.newline()

# the autotune function
code.writeline("def cfggen():")
with code.indent():
code.writeline("block_m = [1, 2, 4, 8]")
code.writeline("block_n = [256, 512, 1024, 2048]")
code.writeline("configs = [")
with code.indent():
code.writeline('triton.Config({"BLOCK_M": m, "BLOCK_N": n}, num_warps=4)')
code.writeline("for m in block_m")
code.writeline("for n in block_n")
code.writeline("]")
code.writeline("return configs")

code.newline()
code.newline()

# the decorators
code.writeline("@libentry()")
code.writeline('@triton.autotune(configs=cfggen(), key=["M", "N"])')
code.writeline(
'@triton.autotune(configs=runtime.get_op_tune_config("gather"), key=["M", "N"])'
)
code.writeline("@triton.jit")

# signature
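Assembled from the writeline calls above, the prologue the updated generator emits reads roughly as follows. The kernel name and parameter list below are placeholders, since the real signature is produced by codegen outside this hunk.

# Approximate generated output: import block plus decorator stack. The kernel
# name and parameters are placeholders; the real ones are generated elsewhere.
import triton
import triton.language as tl

from flag_gems.utils import libentry
from flag_gems import runtime
from flag_gems.utils import triton_lang_extension as tle


@libentry()
@triton.autotune(configs=runtime.get_op_tune_config("gather"), key=["M", "N"])
@triton.jit
def gather_kernel(inp, out, M, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    pid = tle.program_id(0)  # placeholder body; the real body is generated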
14 changes: 2 additions & 12 deletions src/flag_gems/ops/index_select.py
@@ -4,23 +4,13 @@
import triton
import triton.language as tl

from .. import runtime
from ..utils import dim_compress, libentry
from ..utils import triton_lang_extension as tle


def cfggen():
block_m = [1, 2, 4]
block_n = [1024, 2048, 4096]
configs = [
triton.Config({"BLOCK_M": m, "BLOCK_N": n}, num_warps=4)
for m in block_m
for n in block_n
]
return configs


@libentry()
@triton.autotune(configs=cfggen(), key=["M", "N"])
@triton.autotune(configs=runtime.get_op_tune_config("index_select"), key=["M", "N"])
@triton.jit
def index_select_kernel(
inp, out, M, N, index, index_len, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr