
Commit e9c7aa7

[Runtime] solve the problem that llama frequently calls autotuner (#332)
* [operator] turn autotuner to heuristics
* [operator] heuristics for gather & index_select
* [runtime] libtuner for matmul
* [runtime] store config data in one db
* [bugfix] parse key as list instead of tuple
* [no ci] update var name
* [Muti_backend] muti_backend-part_1-framework-and-tune_config (#294)
* new feature, muti_backend
* update auto_tune_module
* update auto_tune_module
* update auto_tune_module
* update __init__
* rebase
* fix bug
* modifiy auto_tune_config
* fix bug
* fix bug
* update
* update
* update scatter&gather
* fix auto_tune
* add gen_torch_device_fn
* fix codestyle
* fix codestyle
* Modify code based on comments
* Modify gen_impl with loops instead of recursion
* Update code structure
* Polish code
* update
* Polish code
* Modify code based on comments
* modify based on comment
* Modify code based on comments
* update
* final fix
* [bugfix] update libtuner to be compatible with triton2
* [no ci] reformat
* [operator] update log_softmax
* [pretune] move pretune to ./examples for models
* [format] delete useless print
* [format] delete unused import
* [format] [no ci] remove useless print

---------

Co-authored-by: Galaxy1458 <[email protected]>
1 parent e437b36 commit e9c7aa7

14 files changed (+316 lines, -33 lines)


examples/pretune.py

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
+import argparse
+
+import torch
+
+import flag_gems
+
+device = flag_gems.device
+
+DTYPES = [
+    torch.float16,
+    torch.bfloat16,
+    torch.float32,
+]
+
+LLAMA_SHAPES = {
+    "mm": [
+        [1024, 4096],
+        [128256, 4096],
+        [14336, 4096],
+        [4096, 14336],
+        [4096, 4096],
+        [6144, 4096],
+        [28672, 4096],
+    ],
+}
+
+QWEN_SHAPES = {
+    "mm": [
+        [3584, 3584],
+        [18944, 3584],
+        [3584, 18944],
+        [152064, 3584],
+        [37888, 3584],
+    ],
+    "addmm": [
+        [3584, 3584],
+        [512, 3584],
+        [4608, 3584],
+    ],
+}
+
+
+MODEL_SHAPES = {
+    "llama": LLAMA_SHAPES,
+    "qwen": QWEN_SHAPES,
+}
+
+
+def pretune_mm(max_tokens, shapes):
+    for dtype in DTYPES:
+        for M in range(1, max_tokens + 1):
+            for N, K in shapes:
+                tensor_a = torch.randn([M, K], dtype=dtype, device=device)
+                tensor_b = torch.randn([K, N], dtype=dtype, device=device)
+                flag_gems.mm(tensor_a, tensor_b)
+
+
+def pretune_addmm(max_tokens, shapes):
+    for dtype in DTYPES:
+        for M in range(1, max_tokens + 1):
+            for N, K in shapes:
+                tensor_a = torch.randn([M, K], dtype=dtype, device=device)
+                tensor_b = torch.randn([K, N], dtype=dtype, device=device)
+                bias = torch.randn([M, N], dtype=dtype, device=device)
+                flag_gems.addmm(bias, tensor_a, tensor_b)
+
+
+OPERATORS = {
+    "mm": pretune_mm,
+    "addmm": pretune_addmm,
+}
+
+
+def args_parser():
+    parser = argparse.ArgumentParser(
+        description="pretune for gemm",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=False,
+        default="llama",
+        help="model name",
+    )
+    parser.add_argument(
+        "--max_tokens",
+        type=int,
+        required=False,
+        default=100,
+        help="max tokens",
+    )
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    args = args_parser()
+    model = MODEL_SHAPES.get(args.model)
+    max_tokens = args.max_tokens
+    if not model:
+        exit(0)
+    for op, func in OPERATORS.items():
+        shapes = model.get(op)
+        if not shapes:
+            continue
+        func(max_tokens, shapes)
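Running this script ahead of serving sweeps every M from 1 to --max_tokens over the model's GEMM shapes, so the libtuner-decorated matmul paths for those sizes are exercised (and, per the commit summary, their configs stored) before real requests arrive. A typical invocation with the flags defined in args_parser above; an unrecognized --model simply exits:

python examples/pretune.py --model llama --max_tokens 100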

src/flag_gems/ops/argmax.py

Lines changed: 5 additions & 8 deletions
@@ -5,7 +5,6 @@
 import triton
 import triton.language as tl
 
-from .. import runtime
 from ..runtime import torch_device_fn
 from ..utils import libentry
 from ..utils import triton_lang_extension as tle
@@ -46,20 +45,18 @@ def argmax_kernel_2(mid_value, mid_index, out, mid_size, BLOCK_MID: tl.constexpr
     tl.store(out, out_val)
 
 
+def heur_block_m(args):
+    return 4 if args["M"] < 4096 else 8
+
+
 def heur_block_n(args):
     return min(4096, triton.next_power_of_2(args["N"]))
 
 
 @libentry()
-@triton.autotune(
-    configs=runtime.get_triton_config("argmax"),
-    key=[
-        "M",
-        "N",
-    ],
-)
 @triton.heuristics(
     {
+        "BLOCK_M": heur_block_m,
+        "BLOCK_N": heur_block_n,
     }
 )
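The switch from @triton.autotune to @triton.heuristics is what removes the repeated tuning: heuristics are plain Python functions evaluated from the launch arguments on every call, with no candidate-config benchmarking, so a workload that keeps presenting new M values (e.g. a growing token count) no longer stalls in the tuner. A small, illustrative check of how the two functions above resolve, assuming only triton.next_power_of_2:

import triton

def heur_block_m(args):
    return 4 if args["M"] < 4096 else 8

def heur_block_n(args):
    return min(4096, triton.next_power_of_2(args["N"]))

# Block sizes are decided instantly per launch, even as M changes every call.
for M, N in [(1, 4096), (512, 4096), (8192, 128256)]:
    meta = {"M": M, "N": N}
    print(M, N, "->", heur_block_m(meta), heur_block_n(meta))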

src/flag_gems/ops/gather.py

Lines changed: 20 additions & 6 deletions
@@ -5,7 +5,7 @@
 
 import torch
 
-from flag_gems.utils.code_cache import cache_dir
+from flag_gems.utils.code_cache import code_cache_dir
 from flag_gems.utils.code_utils import IndentedBuffer, NameSpace
 from flag_gems.utils.shape_utils import restride_dim
 
@@ -32,16 +32,30 @@ def generate_gather_kernel(
     # make the inlined function visible in the context
     code.newline()
 
-    # the autotune function
+    code.writeline("def heur_block_m(args):")
+    with code.indent():
+        code.writeline(
+            "return min(4, triton.next_power_of_2(triton.cdiv(args['N'], 2048)))"
+        )
+
+    code.newline()
+    code.writeline("def heur_block_n(args):")
+    with code.indent():
+        code.writeline("return min(2048, triton.next_power_of_2(args['N']))")
 
     code.newline()
     code.newline()
 
     # the decorators
     code.writeline("@libentry()")
-    code.writeline(
-        '@triton.autotune(configs=runtime.get_triton_config("gather"), key=["M", "N"])'
-    )
+    code.writeline("@triton.heuristics(")
+    with code.indent():
+        code.writeline("{")
+        with code.indent():
+            code.writeline('"BLOCK_M": heur_block_m,')
+            code.writeline('"BLOCK_N": heur_block_n,')
+        code.writeline("}")
+    code.writeline(")")
     code.writeline("@triton.jit")
 
     # signature
@@ -217,7 +231,7 @@ def __call__(self, *args, **kwargs):
 
         file_name = f"gather_rank_{key}_pid_{self.pid}.py"
 
-        with open(cache_dir() / file_name, "wt", encoding="utf-8") as f:
+        with open(code_cache_dir() / file_name, "wt", encoding="utf-8") as f:
            f.write(code.getvalue())
 
        # load
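For reference, the first block of writelines above materializes these two helpers near the top of the generated gather_rank_*.py file (reconstructed from the generation calls; the @libentry() / @triton.heuristics({...}) / @triton.jit decorator stack emitted by the second block follows them):

def heur_block_m(args):
    return min(4, triton.next_power_of_2(triton.cdiv(args['N'], 2048)))

def heur_block_n(args):
    return min(2048, triton.next_power_of_2(args['N']))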

src/flag_gems/ops/index_select.py

Lines changed: 15 additions & 2 deletions
@@ -4,13 +4,26 @@
 import triton
 import triton.language as tl
 
-from .. import runtime
 from ..utils import dim_compress, libentry
 from ..utils import triton_lang_extension as tle
 
 
+def heur_block_m(args):
+    return min(4, triton.next_power_of_2(triton.cdiv(256, args["N"])))
+
+
+def heur_block_n(args):
+    m = min(triton.next_power_of_2(triton.cdiv(args["N"], 16)), 512)
+    return max(m, 16)
+
+
 @libentry()
-@triton.autotune(configs=runtime.get_triton_config("index_select"), key=["M", "N"])
+@triton.heuristics(
+    {
+        "BLOCK_M": heur_block_m,
+        "BLOCK_N": heur_block_n,
+    }
+)
 @triton.jit
 def index_select_kernel(
     inp, out, M, N, index, index_len, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr
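As a concrete illustration of what these heuristics pick: for N = 1000, BLOCK_M resolves to 1 and BLOCK_N to 64, since cdiv(256, 1000) = 1 while next_power_of_2(cdiv(1000, 16)) = 64. The arithmetic can be checked directly, using only triton.cdiv and triton.next_power_of_2:

import triton

args = {"N": 1000}
block_m = min(4, triton.next_power_of_2(triton.cdiv(256, args["N"])))             # 1
block_n = max(min(triton.next_power_of_2(triton.cdiv(args["N"], 16)), 512), 16)   # 64
print(block_m, block_n)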

src/flag_gems/ops/log_softmax.py

Lines changed: 3 additions & 3 deletions
@@ -11,16 +11,15 @@
 
 
 @libentry()
-@triton.autotune(configs=runtime.get_triton_config("log_softmax"), key=["M", "N"])
 @triton.jit
 def log_softmax_kernel(
     output_ptr,
     input_ptr,
     M,
     N,
     K,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
+    BLOCK_M: tl.constexpr = 8,
+    BLOCK_N: tl.constexpr = 256,
 ):
     pid_m = tle.program_id(0)
     pid_k = tle.program_id(1)
@@ -122,6 +121,7 @@ def forward(ctx, x, dim, dtype):
             M,
             N,
             K,
+            num_warps=8,
         )
         ctx.save_for_backward(out)
         ctx.dim = dim

src/flag_gems/ops/mm.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 from .. import runtime
 from ..runtime import torch_device_fn
-from ..utils import libentry
+from ..utils import libentry, libtuner
 from ..utils import triton_lang_extension as tle
 
 
@@ -15,7 +15,7 @@ def heur_even_k(args):
 
 
 @libentry()
-@triton.autotune(
+@libtuner(
     configs=runtime.get_triton_config("mm"),
     key=["M", "N", "K"],
 )
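libtuner keeps triton.autotune's configs/key interface, but, per the commit summary, it persists tuning results in a single database so a given (M, N, K) combination is only benchmarked once across runs. The sketch below illustrates that caching idea only; it is not the flag_gems implementation, and the JSON file, config dicts, and timing loop are stand-ins for Triton's real benchmarking machinery:

import functools
import json
import pathlib
import time


def persistent_tuner(configs, key, db=pathlib.Path("tuning_cache.json")):
    """Illustrative only: time each config once per key tuple, persist the
    winner, and replay it on every later call instead of re-tuning."""
    cache = json.loads(db.read_text()) if db.exists() else {}

    def decorator(fn):
        @functools.wraps(fn)
        def launch(*args, **kwargs):
            k = ",".join(str(kwargs[name]) for name in key)  # e.g. "64,4096,4096"
            if k not in cache:
                timings = []
                for cfg in configs:  # stand-in for a real benchmarking pass
                    t0 = time.perf_counter()
                    fn(*args, **kwargs, **cfg)
                    timings.append((time.perf_counter() - t0, cfg))
                cache[k] = min(timings, key=lambda t: t[0])[1]
                db.write_text(json.dumps(cache))
            return fn(*args, **kwargs, **cache[k])

        return launch

    return decorator


@persistent_tuner(configs=[{"BLOCK": 64}, {"BLOCK": 128}], key=["M", "N", "K"])
def fake_mm(M, N, K, BLOCK):
    return BLOCK  # placeholder for a real kernel launch


fake_mm(M=64, N=4096, K=4096)  # first call times both configs and stores the pick
fake_mm(M=64, N=4096, K=4096)  # later calls reuse the stored config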

src/flag_gems/ops/pad.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 
 import torch
 
-from flag_gems.utils.code_cache import cache_dir
+from flag_gems.utils.code_cache import code_cache_dir
 from flag_gems.utils.code_utils import IndentedBuffer, NameSpace
 
 
@@ -424,7 +424,7 @@ def __call__(self, *args, **kwargs):
 
         file_name = f"constant_pad_rank_{key}_pid_{self.pid}.py"
 
-        with open(cache_dir() / file_name, "wt", encoding="utf-8") as f:
+        with open(code_cache_dir() / file_name, "wt", encoding="utf-8") as f:
            f.write(code.getvalue())
 
        # load

src/flag_gems/ops/repeat.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 
 import torch
 
-from flag_gems.utils.code_cache import cache_dir
+from flag_gems.utils.code_cache import code_cache_dir
 from flag_gems.utils.code_utils import IndentedBuffer, NameSpace
 
 
@@ -437,7 +437,7 @@ def __call__(self, x, sizes):
 
         file_name = f"repeat_rank_{key}_pid_{self.pid}.py"
 
-        with open(cache_dir() / file_name, "wt", encoding="utf-8") as f:
+        with open(code_cache_dir() / file_name, "wt", encoding="utf-8") as f:
            f.write(code.getvalue())
 
        # load

src/flag_gems/ops/scatter.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 
 import torch
 
-from flag_gems.utils.code_cache import cache_dir
+from flag_gems.utils.code_cache import code_cache_dir
 from flag_gems.utils.code_utils import IndentedBuffer, NameSpace
 
 
@@ -248,7 +248,7 @@ def __call__(self, *args, **kwargs):
 
         file_name = f"scatter_rank_{key}_pid_{self.pid}.py"
 
-        with open(cache_dir() / file_name, "wt", encoding="utf-8") as f:
+        with open(code_cache_dir() / file_name, "wt", encoding="utf-8") as f:
            f.write(code.getvalue())
 
        # load

src/flag_gems/ops/tile.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 
 import torch
 
-from flag_gems.utils.code_cache import cache_dir
+from flag_gems.utils.code_cache import code_cache_dir
 from flag_gems.utils.code_utils import IndentedBuffer, NameSpace
 
 
@@ -437,7 +437,7 @@ def __call__(self, x, dims):
 
         file_name = f"tile_rank_{key}_pid_{self.pid}.py"
 
-        with open(cache_dir() / file_name, "wt", encoding="utf-8") as f:
+        with open(code_cache_dir() / file_name, "wt", encoding="utf-8") as f:
            f.write(code.getvalue())
 
        # load
