diff --git a/Report.md b/Report.md new file mode 100644 index 000000000..bf43cabc3 --- /dev/null +++ b/Report.md @@ -0,0 +1,228 @@ +## 项目 #2:GPU 集成 + +### 1. 架构设计 + +本次实现没有改动 LLAISYS 的整体执行框架,只在现有 `device -> ops -> model` 链路中插入 GPU 后端。 + +```text +Python API / Test + | + v + LLAISYS C API + | + v + Runtime / Tensor / Model + | + +-------------------+-------------------+ + | | | + v v v + CPU NVIDIA GPU MetaX GPU + | | + v v + src/device/nvidia/ src/device/metax/ + | | + +---------+---------+ + | + v + 算子分发 src/ops//op.cpp + | + +-----------------+-----------------+ + | | + v v + src/ops//nvidia/*.cu src/ops//metax/*.maca + | + v + 复用 ../nvidia/*.cu 算子主体 +``` + +- 设备层: + - `src/device/nvidia/` 实现 NVIDIA Runtime API 与设备资源管理 + - `src/device/metax/` 实现 MetaX Runtime API 入口 +- 算子层: + - `src/ops/*/nvidia/` 实现 CUDA 算子 + - `src/ops/*/metax/` 作为 MetaX 编译入口 +- 构建层: + - `xmake/nvidia.lua` 管理 CUDA/NVCC 编译 + - `xmake/metax.lua` 管理 MACA/MXCC 编译 + +核心设计是“平台分离、算子复用”: + +- NVIDIA 路径使用原生 CUDA 构建与 runtime +- MetaX 路径单独提供设备枚举、构建规则和 runtime 分发 +- MetaX 不重写整套算子,而是通过 `.maca` 入口复用 `nvidia/*.cu` 中的 CUDA-like 算子主体 + +因此,框架层面是两条独立 GPU 后端;算子源码层面只维护一套主实现。 + +### 2. 实现步骤 + +#### 2.1 NVIDIA 后端 + +第一步是补全 NVIDIA Runtime API,对齐 CPU Runtime 接口,包括: + +- device count / set device +- malloc / free +- memcpy +- synchronize + +随后在 `src/device/runtime_api.cpp` 中注册 NVIDIA runtime,使上层 `Tensor`、`RuntimeAPI` 和模型代码可以直接使用 GPU 设备。 + +第二步是接入 CUDA 构建链: + +- 在 `xmake/nvidia.lua` 中加入 `.cu` 编译与链接规则 +- 通过 `--nv-gpu=y` 控制是否启用 GPU 编译 + +第三步是补全 CUDA 算子。实现上采用统一模式: + +- 每个算子在 `src/ops//nvidia/` 中提供 host 入口 +- host 入口完成 dtype 分派、launch 配置与错误检查 +- 计算逻辑写在模板化 kernel 中 + +实现重点在两个热点算子: + +- `linear` + - 采用“一线程对应一个输出元素”的映射 + - `fp16/bf16` 先转 `float` 再累加 +- `self_attention` + - 采用二维 grid,按 `(query, head)` 映射 block + - 在 block 内完成 score 计算、softmax 和 value 加权 + - `scores` 使用 shared memory 存储 + +其余算子如 `add`、`rope`、`rms_norm`、`swiglu`、`embedding`、`argmax`、`rearrange` 按相同方式补齐,形成完整推理执行链。 + +#### 2.2 MetaX 后端 + +MetaX 的实现重点不在重新设计算子,而在接入新的设备路径。 + +实现步骤如下: + +1. 新增 `ENABLE_METAX_API` +2. 新增 `LLAISYS_DEVICE_METAX` 与 Python 侧 `DeviceType.METAX` +3. 在 `runtime_api.cpp` 中加入 MetaX runtime 分发 +4. 新增 `xmake/metax.lua`,使用 `mxcc` 编译 `.maca` +5. 为每个算子添加 `src/ops/*/metax/*.maca` 入口 +6. 在 `.maca` 中复用 `../nvidia/*.cu` 算子主体 + +这样实现后,MetaX 具备独立设备语义,但不引入第二套重复算子实现。 +这一点是本次适配的关键取舍。 + +### 3. 测试 + +测试分两层进行。 + +#### 3.1 单算子测试 + +先逐个验证 GPU 算子: + +```bash +python test/test_runtime.py --device nvidia +python test/ops/add.py --device nvidia +python test/ops/argmax.py --device nvidia +python test/ops/embedding.py --device nvidia +python test/ops/linear.py --device nvidia +python test/ops/rms_norm.py --device nvidia +python test/ops/rope.py --device nvidia +python test/ops/self_attention.py --device nvidia +python test/ops/swiglu.py --device nvidia +``` + +MetaX 路径使用同样方法,设备改为 `metax`。 +这样可以先验证 Runtime、dtype 分派和单算子正确性,再进入整模型测试。 + +#### 3.2 端到端推理测试 + +最终使用 `test/test_infer.py --test` 验证整条执行链。判断标准不是只看程序是否运行,而是: + +- 生成 token 是否与参考一致 +- 文本输出是否一致 +- 测试是否通过 + +Nvidia推理测试结果如下: +``` +(base) machine@dsw-607126-85f54bdf75-5lzlx:~/llaisys$ python test/test_infer.py --model ../models/DeepSeek-R1-Distill-Qwen-1.5B/ --test --device nvidia +`torch_dtype` is deprecated! Use `dtype` instead! +Loading model from local path: ../models/DeepSeek-R1-Distill-Qwen-1.5B/ +Loading weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 339/339 [00:03<00:00, 95.80it/s] +The module name (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues. +The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. +Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation. +The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + +=== Answer === + +Tokens: +[151646, 151644, 15191, 525, 498, 30, 151645, 151648, 198, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 624, 151649, 271, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 13, 151643] + +Contents: +<|User|>Who are you?<|Assistant|> +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Time elapsed: 9.36s + + +=== Your Result === + +Tokens: +[151646, 151644, 15191, 525, 498, 30, 151645, 151648, 198, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 624, 151649, 271, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 13, 151643] + +Contents: +<|User|>Who are you?<|Assistant|> +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Time elapsed: 83.64s + +Test passed! +``` + +曦云 C500推理结果如下: +``` +(base) root@d3871d5ad673:/home/machine/llaisys# python test/test_infer.py --test --device metax +Loading model from Hugging Face: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +Fetching 9 files: 100%|████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 102023.61it/s] +`torch_dtype` is deprecated! Use `dtype` instead! +The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. +Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation. +The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. +/opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:5912: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp:649.) + return _scaled_dot_product_attention(query, key, value, attn_mask, dropout_p, is_causal, scale = scale, enable_gqa = enable_gqa) + +=== Answer === + +Tokens: +[151646, 151646, 151644, 15191, 525, 498, 30, 151645, 151648, 198, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 624, 151649, 271, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 13, 151643] + +Contents: +<|User|>Who are you?<|Assistant|> +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Time elapsed: 2.85s + + +=== Your Result === + +Tokens: +[151646, 151646, 151644, 15191, 525, 498, 30, 151645, 151648, 198, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 624, 151649, 271, 91786, 0, 358, 2776, 18183, 39350, 10911, 16, 11, 458, 20443, 11229, 17847, 3465, 553, 18183, 39350, 13, 358, 2776, 518, 697, 2473, 323, 1035, 387, 33972, 311, 7789, 498, 448, 894, 43883, 476, 9079, 498, 1231, 614, 13, 151643] + +Contents: +<|User|>Who are you?<|Assistant|> +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have. + + +Time elapsed: 31.70s + +Test passed! +``` diff --git a/include/llaisys.h b/include/llaisys.h index 73ca7eead..ca9f03184 100644 --- a/include/llaisys.h +++ b/include/llaisys.h @@ -24,6 +24,7 @@ typedef enum { LLAISYS_DEVICE_CPU = 0, //// TODO: Add more device types here. Numbers need to be consecutive. LLAISYS_DEVICE_NVIDIA = 1, + LLAISYS_DEVICE_METAX = 2, LLAISYS_DEVICE_TYPE_COUNT } llaisysDeviceType_t; diff --git a/include/llaisys/qwen2.h b/include/llaisys/qwen2.h new file mode 100644 index 000000000..f47da9ff0 --- /dev/null +++ b/include/llaisys/qwen2.h @@ -0,0 +1,62 @@ +#ifndef LLAISYS_QWEN2_H +#define LLAISYS_QWEN2_H + +#include "../llaisys.h" +#include "tensor.h" + +__C { + typedef struct LlaisysQwen2Model *llaisysQwen2Model_t; + + struct LlaisysQwen2Meta { + size_t nlayer; // num_hidden_layers + size_t hs; // hidden_size + size_t nh; // num_attention_heads + size_t nkvh; // num_key_value_heads + size_t dh; // head_dim = hs / nh + size_t di; // intermediate_size + size_t maxseq; // max_position_embeddings + size_t voc; // vocab_size + float epsilon; // rms_norm_eps + float theta; // rope_theta + int64_t end_token; // eos_token_id + }; + + struct LlaisysQwen2Weights { + llaisysTensor_t in_embed; // [voc, hs] + llaisysTensor_t out_embed; // [voc, hs] + llaisysTensor_t out_norm_w; // [hs] + + // Per-layer weights (arrays of size nlayer) + llaisysTensor_t* attn_norm_w; // [nlayer][hs] + llaisysTensor_t* attn_q_w; // [nlayer][nh*dh, hs] + llaisysTensor_t* attn_q_b; // [nlayer][nh*dh] + llaisysTensor_t* attn_k_w; // [nlayer][nkvh*dh, hs] + llaisysTensor_t* attn_k_b; // [nlayer][nkvh*dh] + llaisysTensor_t* attn_v_w; // [nlayer][nkvh*dh, hs] + llaisysTensor_t* attn_v_b; // [nlayer][nkvh*dh] + llaisysTensor_t* attn_o_w; // [nlayer][hs, nh*dh] + + llaisysTensor_t* mlp_norm_w; // [nlayer][hs] + llaisysTensor_t* mlp_gate_w; // [nlayer][di, hs] + llaisysTensor_t* mlp_up_w; // [nlayer][di, hs] + llaisysTensor_t* mlp_down_w; // [nlayer][hs, di] + }; + + __export llaisysQwen2Model_t llaisysQwen2ModelCreate( + const struct LlaisysQwen2Meta* meta, + llaisysDeviceType_t device_type, + int device_id); + + __export void llaisysQwen2ModelDestroy(llaisysQwen2Model_t model); + + __export struct LlaisysQwen2Weights* llaisysQwen2ModelWeights(llaisysQwen2Model_t model); + + __export int64_t llaisysQwen2ModelInfer( + llaisysQwen2Model_t model, + int64_t* token_ids, + size_t ntoken); + + __export void llaisysQwen2ModelResetCache(llaisysQwen2Model_t model); +} + +#endif // LLAISYS_QWEN2_H diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb527..ac79f9b2d 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,6 +12,8 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +from .models.qwen2 import load_qwen2 +from .models.qwen2 import LlaisysQwen2Meta, LlaisysQwen2Weights def load_shared_library(): @@ -22,7 +24,7 @@ def load_shared_library(): elif sys.platform == "win32": libname = "llaisys.dll" elif sys.platform == "darwin": - libname = "llaisys.dylib" + libname = "libllaisys.dylib" else: raise RuntimeError("Unsupported platform") @@ -38,6 +40,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_qwen2(LIB_LLAISYS) __all__ = [ @@ -52,4 +55,6 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + "LlaisysQwen2Meta", + "LlaisysQwen2Weights", ] diff --git a/python/llaisys/libllaisys/llaisys_types.py b/python/llaisys/libllaisys/llaisys_types.py index c5a0b4679..cbe92132e 100644 --- a/python/llaisys/libllaisys/llaisys_types.py +++ b/python/llaisys/libllaisys/llaisys_types.py @@ -6,7 +6,8 @@ class DeviceType(IntEnum): CPU = 0 NVIDIA = 1 - COUNT = 2 + METAX = 2 + COUNT = 3 llaisysDeviceType_t = ctypes.c_int diff --git a/python/llaisys/libllaisys/models/__init__.py b/python/llaisys/libllaisys/models/__init__.py new file mode 100644 index 000000000..03abae6e0 --- /dev/null +++ b/python/llaisys/libllaisys/models/__init__.py @@ -0,0 +1,7 @@ +from .qwen2 import load_qwen2, LlaisysQwen2Meta, LlaisysQwen2Weights + +__all__ = [ + "load_qwen2", + "LlaisysQwen2Meta", + "LlaisysQwen2Weights", +] diff --git a/python/llaisys/libllaisys/models/qwen2.py b/python/llaisys/libllaisys/models/qwen2.py new file mode 100644 index 000000000..10e746537 --- /dev/null +++ b/python/llaisys/libllaisys/models/qwen2.py @@ -0,0 +1,75 @@ +import ctypes +from ctypes import POINTER, c_void_p, c_size_t, c_int, c_int64, c_float, Structure + +from ..llaisys_types import llaisysDeviceType_t +from ..tensor import llaisysTensor_t + + +class LlaisysQwen2Meta(Structure): + _fields_ = [ + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +class LlaisysQwen2Weights(Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", POINTER(llaisysTensor_t)), + ("attn_q_w", POINTER(llaisysTensor_t)), + ("attn_q_b", POINTER(llaisysTensor_t)), + ("attn_k_w", POINTER(llaisysTensor_t)), + ("attn_k_b", POINTER(llaisysTensor_t)), + ("attn_v_w", POINTER(llaisysTensor_t)), + ("attn_v_b", POINTER(llaisysTensor_t)), + ("attn_o_w", POINTER(llaisysTensor_t)), + ("mlp_norm_w", POINTER(llaisysTensor_t)), + ("mlp_gate_w", POINTER(llaisysTensor_t)), + ("mlp_up_w", POINTER(llaisysTensor_t)), + ("mlp_down_w", POINTER(llaisysTensor_t)), + ] + + +# Model handle type +llaisysQwen2Model_t = c_void_p + + +def load_qwen2(lib): + # llaisysQwen2ModelCreate + lib.llaisysQwen2ModelCreate.argtypes = [ + POINTER(LlaisysQwen2Meta), + llaisysDeviceType_t, + c_int, + ] + lib.llaisysQwen2ModelCreate.restype = llaisysQwen2Model_t + + # llaisysQwen2ModelDestroy + lib.llaisysQwen2ModelDestroy.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelDestroy.restype = None + + # llaisysQwen2ModelWeights + lib.llaisysQwen2ModelWeights.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelWeights.restype = POINTER(LlaisysQwen2Weights) + + # llaisysQwen2ModelInfer + lib.llaisysQwen2ModelInfer.argtypes = [ + llaisysQwen2Model_t, + POINTER(c_int64), + c_size_t, + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 + + # llaisysQwen2ModelResetCache + lib.llaisysQwen2ModelResetCache.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelResetCache.restype = None diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b21..c6f88d8a2 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,109 @@ from typing import Sequence from ..libllaisys import LIB_LLAISYS from ..libllaisys import DeviceType +from ..libllaisys import LlaisysQwen2Meta, LlaisysQwen2Weights from pathlib import Path -import safetensors +import json +import ctypes +import safetensors.torch +import torch class Qwen2: - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - + def __init__(self, model_path, device: DeviceType = DeviceType.CPU, device_id: int = 0): model_path = Path(model_path) + # Read config.json + config_path = model_path / "config.json" + with open(config_path, "r") as f: + config = json.load(f) + + # Extract model parameters + nlayer = config["num_hidden_layers"] + hs = config["hidden_size"] + nh = config["num_attention_heads"] + nkvh = config["num_key_value_heads"] + dh = hs // nh + di = config["intermediate_size"] + maxseq = config["max_position_embeddings"] + voc = config["vocab_size"] + epsilon = config["rms_norm_eps"] + theta = config.get("rope_theta", 10000.0) + end_token = config["eos_token_id"] + + # Create meta structure + self._meta = LlaisysQwen2Meta( + nlayer=nlayer, + hs=hs, + nh=nh, + nkvh=nkvh, + dh=dh, + di=di, + maxseq=maxseq, + voc=voc, + epsilon=epsilon, + theta=theta, + end_token=end_token, + ) + + self._device = device + self._device_id = device_id + self._nlayer = nlayer + self._end_token = end_token + + # Create model + self._model = LIB_LLAISYS.llaisysQwen2ModelCreate( + ctypes.byref(self._meta), + device, + device_id, + ) + + # Get weights pointer + self._weights = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model) + + # Load weights from safetensors + self._load_weights(model_path) + + def _load_weights(self, model_path: Path): + weights = self._weights.contents + + # Weight name mapping + weight_map = { + "model.embed_tokens.weight": weights.in_embed, + "lm_head.weight": weights.out_embed, + "model.norm.weight": weights.out_norm_w, + } + + # Per-layer weight mapping + for i in range(self._nlayer): + weight_map[f"model.layers.{i}.input_layernorm.weight"] = weights.attn_norm_w[i] + weight_map[f"model.layers.{i}.self_attn.q_proj.weight"] = weights.attn_q_w[i] + weight_map[f"model.layers.{i}.self_attn.q_proj.bias"] = weights.attn_q_b[i] + weight_map[f"model.layers.{i}.self_attn.k_proj.weight"] = weights.attn_k_w[i] + weight_map[f"model.layers.{i}.self_attn.k_proj.bias"] = weights.attn_k_b[i] + weight_map[f"model.layers.{i}.self_attn.v_proj.weight"] = weights.attn_v_w[i] + weight_map[f"model.layers.{i}.self_attn.v_proj.bias"] = weights.attn_v_b[i] + weight_map[f"model.layers.{i}.self_attn.o_proj.weight"] = weights.attn_o_w[i] + weight_map[f"model.layers.{i}.post_attention_layernorm.weight"] = weights.mlp_norm_w[i] + weight_map[f"model.layers.{i}.mlp.gate_proj.weight"] = weights.mlp_gate_w[i] + weight_map[f"model.layers.{i}.mlp.up_proj.weight"] = weights.mlp_up_w[i] + weight_map[f"model.layers.{i}.mlp.down_proj.weight"] = weights.mlp_down_w[i] + + # Load from safetensors files for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + data = safetensors.torch.load_file(file, device="cpu") + for name, tensor_data in data.items(): + if name in weight_map: + tensor_handle = weight_map[name] + # Ensure tensor is contiguous and get raw data pointer + tensor_data = tensor_data.contiguous() + # Load data into tensor + LIB_LLAISYS.tensorLoad( + tensor_handle, + ctypes.c_void_p(tensor_data.data_ptr()), + ) def generate( self, @@ -27,7 +113,40 @@ def generate( top_p: float = 0.8, temperature: float = 0.8, ): + # Reset KV cache for new generation + LIB_LLAISYS.llaisysQwen2ModelResetCache(self._model) + + # Convert inputs to ctypes array + input_list = list(inputs) + outputs = input_list.copy() + + # First inference with all input tokens + input_array = (ctypes.c_int64 * len(input_list))(*input_list) + next_token = LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, + input_array, + len(input_list), + ) + outputs.append(next_token) + + # Continue generating + tokens_generated = 1 + while max_new_tokens is None or tokens_generated < max_new_tokens: + if next_token == self._end_token: + break + + # Single token inference + single_token = (ctypes.c_int64 * 1)(next_token) + next_token = LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, + single_token, + 1, + ) + outputs.append(next_token) + tokens_generated += 1 - # TODO: Implement generate function + return outputs - return [] + def __del__(self): + if hasattr(self, "_model") and self._model: + LIB_LLAISYS.llaisysQwen2ModelDestroy(self._model) diff --git a/src/device/metax/metax_runtime_api.maca b/src/device/metax/metax_runtime_api.maca new file mode 100644 index 000000000..94432ab35 --- /dev/null +++ b/src/device/metax/metax_runtime_api.maca @@ -0,0 +1,9 @@ +#include "../nvidia/nvidia_runtime_api.cu" + +namespace llaisys::device::metax { + +const LlaisysRuntimeAPI *getRuntimeAPI() { + return nvidia::getRuntimeAPI(); +} + +} // namespace llaisys::device::metax diff --git a/src/device/nvidia/nvidia_resource.cu b/src/device/nvidia/nvidia_resource.cu index 2e63647e5..31df34f26 100644 --- a/src/device/nvidia/nvidia_resource.cu +++ b/src/device/nvidia/nvidia_resource.cu @@ -1,7 +1,57 @@ #include "nvidia_resource.cuh" +#include +#include +#include + namespace llaisys::device::nvidia { -Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {} +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::fprintf(stderr, "[CUDA ERROR] %s:%d: %s\n", __FILE__, \ + __LINE__, cudaGetErrorString(err)); \ + } \ + } while (0) + +#define CUBLAS_CHECK(call) \ + do { \ + cublasStatus_t status = call; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + std::fprintf(stderr, "[cuBLAS ERROR] %s:%d: status=%d\n", __FILE__, \ + __LINE__, status); \ + } \ + } while (0) + +Resource::Resource(int device_id) + : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {} + +Resource::~Resource() { + if (_cublas_handle != nullptr) { + CUBLAS_CHECK(cublasDestroy(_cublas_handle)); + _cublas_handle = nullptr; + } + _initialized = false; +} + +void Resource::init() { + if (_initialized) return; + + // Set device before creating handles + CUDA_CHECK(cudaSetDevice(getDeviceId())); + + // Create cuBLAS handle + CUBLAS_CHECK(cublasCreate(&_cublas_handle)); + + _initialized = true; +} + +cublasHandle_t Resource::cublasHandle() { + if (!_initialized) { + init(); + } + return _cublas_handle; +} } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_resource.cuh b/src/device/nvidia/nvidia_resource.cuh index a3002170b..5744fd7a1 100644 --- a/src/device/nvidia/nvidia_resource.cuh +++ b/src/device/nvidia/nvidia_resource.cuh @@ -2,10 +2,31 @@ #include "../device_resource.hpp" +// Forward declarations for CUDA types +typedef struct cublasContext *cublasHandle_t; + namespace llaisys::device::nvidia { + +// NVIDIA-specific device resources +// Each device (GPU) has its own set of handles for library calls class Resource : public llaisys::device::DeviceResource { +private: + cublasHandle_t _cublas_handle = nullptr; + bool _initialized = false; + public: Resource(int device_id); ~Resource(); + + // Initialize resources (lazy initialization) + void init(); + + // Get cuBLAS handle ( initializes if needed) + cublasHandle_t cublasHandle(); + + // Prevent copying + Resource(const Resource&) = delete; + Resource& operator=(const Resource&) = delete; }; + } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu index cab928261..92151ef10 100644 --- a/src/device/nvidia/nvidia_runtime_api.cu +++ b/src/device/nvidia/nvidia_runtime_api.cu @@ -1,56 +1,102 @@ #include "../runtime_api.hpp" -#include -#include +#include +#include +#include namespace llaisys::device::nvidia { namespace runtime_api { + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::fprintf(stderr, "[CUDA ERROR] %s:%d: %s\n", __FILE__, \ + __LINE__, cudaGetErrorString(err)); \ + throw std::runtime_error(cudaGetErrorString(err)); \ + } \ + } while (0) + int getDeviceCount() { - TO_BE_IMPLEMENTED(); + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err != cudaSuccess) { + return 0; // No CUDA devices available + } + return count; } -void setDevice(int) { - TO_BE_IMPLEMENTED(); +void setDevice(int device_id) { + CUDA_CHECK(cudaSetDevice(device_id)); } void deviceSynchronize() { - TO_BE_IMPLEMENTED(); + CUDA_CHECK(cudaDeviceSynchronize()); } llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + return static_cast(stream); } void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (stream != nullptr) { + CUDA_CHECK(cudaStreamDestroy(static_cast(stream))); + } } + void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + CUDA_CHECK(cudaStreamSynchronize(static_cast(stream))); } void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + CUDA_CHECK(cudaMalloc(&ptr, size)); + return ptr; } void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr != nullptr) { + CUDA_CHECK(cudaFree(ptr)); + } } void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + CUDA_CHECK(cudaMallocHost(&ptr, size)); + return ptr; } void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr != nullptr) { + CUDA_CHECK(cudaFreeHost(ptr)); + } +} + +static cudaMemcpyKind toCudaMemcpyKind(llaisysMemcpyKind_t kind) { + switch (kind) { + case LLAISYS_MEMCPY_H2H: + return cudaMemcpyHostToHost; + case LLAISYS_MEMCPY_H2D: + return cudaMemcpyHostToDevice; + case LLAISYS_MEMCPY_D2H: + return cudaMemcpyDeviceToHost; + case LLAISYS_MEMCPY_D2D: + return cudaMemcpyDeviceToDevice; + default: + throw std::runtime_error("Unknown memcpy kind"); + } } void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); + CUDA_CHECK(cudaMemcpy(dst, src, size, toCudaMemcpyKind(kind))); } -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + CUDA_CHECK(cudaMemcpyAsync(dst, src, size, toCudaMemcpyKind(kind), + static_cast(stream))); } static const LlaisysRuntimeAPI RUNTIME_API = { @@ -72,4 +118,5 @@ static const LlaisysRuntimeAPI RUNTIME_API = { const LlaisysRuntimeAPI *getRuntimeAPI() { return &runtime_api::RUNTIME_API; } + } // namespace llaisys::device::nvidia diff --git a/src/device/runtime_api.cpp b/src/device/runtime_api.cpp index 2de3eca02..233afa896 100644 --- a/src/device/runtime_api.cpp +++ b/src/device/runtime_api.cpp @@ -80,6 +80,12 @@ const LlaisysRuntimeAPI *getRuntimeAPI(llaisysDeviceType_t device_type) { return llaisys::device::nvidia::getRuntimeAPI(); #else return getUnsupportedRuntimeAPI(); +#endif + case LLAISYS_DEVICE_METAX: +#ifdef ENABLE_METAX_API + return llaisys::device::metax::getRuntimeAPI(); +#else + return getUnsupportedRuntimeAPI(); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/device/runtime_api.hpp b/src/device/runtime_api.hpp index e6b9f80d6..0e94644f5 100644 --- a/src/device/runtime_api.hpp +++ b/src/device/runtime_api.hpp @@ -17,4 +17,10 @@ namespace nvidia { const LlaisysRuntimeAPI *getRuntimeAPI(); } #endif + +#ifdef ENABLE_METAX_API +namespace metax { +const LlaisysRuntimeAPI *getRuntimeAPI(); +} +#endif } // namespace llaisys::device diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc new file mode 100644 index 000000000..c08382de7 --- /dev/null +++ b/src/llaisys/qwen2.cc @@ -0,0 +1,135 @@ +#include "llaisys/qwen2.h" + +#include "llaisys_tensor.hpp" +#include "../models/qwen2/qwen2.hpp" + +#include + +__C { + struct LlaisysQwen2Model { + llaisys::models::Qwen2Model* model; + LlaisysQwen2Weights weights_c; + size_t nlayer; + }; + + llaisysQwen2Model_t llaisysQwen2ModelCreate( + const struct LlaisysQwen2Meta* meta, + llaisysDeviceType_t device_type, + int device_id) { + + // Convert C meta to C++ meta + llaisys::models::Qwen2Meta cpp_meta; + cpp_meta.nlayer = meta->nlayer; + cpp_meta.hs = meta->hs; + cpp_meta.nh = meta->nh; + cpp_meta.nkvh = meta->nkvh; + cpp_meta.dh = meta->dh; + cpp_meta.di = meta->di; + cpp_meta.maxseq = meta->maxseq; + cpp_meta.voc = meta->voc; + cpp_meta.epsilon = meta->epsilon; + cpp_meta.theta = meta->theta; + cpp_meta.end_token = meta->end_token; + + auto* handle = new LlaisysQwen2Model(); + handle->model = new llaisys::models::Qwen2Model(&cpp_meta, device_type, device_id); + handle->nlayer = meta->nlayer; + + auto* cpp_weights = handle->model->weights(); + + // Allocate arrays for per-layer weights + size_t nlayer = meta->nlayer; + handle->weights_c.attn_norm_w = new llaisysTensor_t[nlayer]; + handle->weights_c.attn_q_w = new llaisysTensor_t[nlayer]; + handle->weights_c.attn_q_b = new llaisysTensor_t[nlayer]; + handle->weights_c.attn_k_w = new llaisysTensor_t[nlayer]; + handle->weights_c.attn_k_b = new llaisysTensor_t[nlayer]; + handle->weights_c.attn_v_w = new llaisysTensor_t[nlayer]; + handle->weights_c.attn_v_b = new llaisysTensor_t[nlayer]; + handle->weights_c.attn_o_w = new llaisysTensor_t[nlayer]; + handle->weights_c.mlp_norm_w = new llaisysTensor_t[nlayer]; + handle->weights_c.mlp_gate_w = new llaisysTensor_t[nlayer]; + handle->weights_c.mlp_up_w = new llaisysTensor_t[nlayer]; + handle->weights_c.mlp_down_w = new llaisysTensor_t[nlayer]; + + // Wrap embedding weights + handle->weights_c.in_embed = new LlaisysTensor{cpp_weights->in_embed}; + handle->weights_c.out_embed = new LlaisysTensor{cpp_weights->out_embed}; + handle->weights_c.out_norm_w = new LlaisysTensor{cpp_weights->out_norm_w}; + + // Wrap per-layer weights + for (size_t i = 0; i < nlayer; i++) { + handle->weights_c.attn_norm_w[i] = new LlaisysTensor{cpp_weights->attn_norm_w[i]}; + handle->weights_c.attn_q_w[i] = new LlaisysTensor{cpp_weights->attn_q_w[i]}; + handle->weights_c.attn_q_b[i] = new LlaisysTensor{cpp_weights->attn_q_b[i]}; + handle->weights_c.attn_k_w[i] = new LlaisysTensor{cpp_weights->attn_k_w[i]}; + handle->weights_c.attn_k_b[i] = new LlaisysTensor{cpp_weights->attn_k_b[i]}; + handle->weights_c.attn_v_w[i] = new LlaisysTensor{cpp_weights->attn_v_w[i]}; + handle->weights_c.attn_v_b[i] = new LlaisysTensor{cpp_weights->attn_v_b[i]}; + handle->weights_c.attn_o_w[i] = new LlaisysTensor{cpp_weights->attn_o_w[i]}; + handle->weights_c.mlp_norm_w[i] = new LlaisysTensor{cpp_weights->mlp_norm_w[i]}; + handle->weights_c.mlp_gate_w[i] = new LlaisysTensor{cpp_weights->mlp_gate_w[i]}; + handle->weights_c.mlp_up_w[i] = new LlaisysTensor{cpp_weights->mlp_up_w[i]}; + handle->weights_c.mlp_down_w[i] = new LlaisysTensor{cpp_weights->mlp_down_w[i]}; + } + + return handle; + } + + void llaisysQwen2ModelDestroy(llaisysQwen2Model_t model) { + if (!model) return; + + size_t nlayer = model->nlayer; + + // Delete wrapper tensors (not the underlying tensors, they're managed by C++) + delete model->weights_c.in_embed; + delete model->weights_c.out_embed; + delete model->weights_c.out_norm_w; + + for (size_t i = 0; i < nlayer; i++) { + delete model->weights_c.attn_norm_w[i]; + delete model->weights_c.attn_q_w[i]; + delete model->weights_c.attn_q_b[i]; + delete model->weights_c.attn_k_w[i]; + delete model->weights_c.attn_k_b[i]; + delete model->weights_c.attn_v_w[i]; + delete model->weights_c.attn_v_b[i]; + delete model->weights_c.attn_o_w[i]; + delete model->weights_c.mlp_norm_w[i]; + delete model->weights_c.mlp_gate_w[i]; + delete model->weights_c.mlp_up_w[i]; + delete model->weights_c.mlp_down_w[i]; + } + + delete[] model->weights_c.attn_norm_w; + delete[] model->weights_c.attn_q_w; + delete[] model->weights_c.attn_q_b; + delete[] model->weights_c.attn_k_w; + delete[] model->weights_c.attn_k_b; + delete[] model->weights_c.attn_v_w; + delete[] model->weights_c.attn_v_b; + delete[] model->weights_c.attn_o_w; + delete[] model->weights_c.mlp_norm_w; + delete[] model->weights_c.mlp_gate_w; + delete[] model->weights_c.mlp_up_w; + delete[] model->weights_c.mlp_down_w; + + delete model->model; + delete model; + } + + struct LlaisysQwen2Weights* llaisysQwen2ModelWeights(llaisysQwen2Model_t model) { + return &model->weights_c; + } + + int64_t llaisysQwen2ModelInfer( + llaisysQwen2Model_t model, + int64_t* token_ids, + size_t ntoken) { + return model->model->infer(token_ids, ntoken); + } + + void llaisysQwen2ModelResetCache(llaisysQwen2Model_t model) { + model->model->resetCache(); + } +} diff --git a/src/models/qwen2/qwen2.cpp b/src/models/qwen2/qwen2.cpp new file mode 100644 index 000000000..159e92397 --- /dev/null +++ b/src/models/qwen2/qwen2.cpp @@ -0,0 +1,249 @@ +#include "qwen2.hpp" + +#include "../../ops/add/op.hpp" +#include "../../ops/argmax/op.hpp" +#include "../../ops/embedding/op.hpp" +#include "../../ops/linear/op.hpp" +#include "../../ops/rearrange/op.hpp" +#include "../../ops/rms_norm/op.hpp" +#include "../../ops/rope/op.hpp" +#include "../../ops/self_attention/op.hpp" +#include "../../ops/swiglu/op.hpp" + +#include +#include + +namespace llaisys::models { + +Qwen2Model::Qwen2Model(const Qwen2Meta* meta, llaisysDeviceType_t device, int device_id) + : _meta(*meta), _device_type(device), _device_id(device_id), _cache_len(0), _max_batch(0) { + + // Set context to the correct device + core::context().setDevice(device, device_id); + + size_t nlayer = _meta.nlayer; + size_t hs = _meta.hs; + size_t nh = _meta.nh; + size_t nkvh = _meta.nkvh; + size_t dh = _meta.dh; + size_t di = _meta.di; + size_t maxseq = _meta.maxseq; + size_t voc = _meta.voc; + + // Allocate embedding weights + _weights.in_embed = Tensor::create({voc, hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.out_embed = Tensor::create({voc, hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.out_norm_w = Tensor::create({hs}, LLAISYS_DTYPE_BF16, device, device_id); + + // Allocate per-layer weights + _weights.attn_norm_w.resize(nlayer); + _weights.attn_q_w.resize(nlayer); + _weights.attn_q_b.resize(nlayer); + _weights.attn_k_w.resize(nlayer); + _weights.attn_k_b.resize(nlayer); + _weights.attn_v_w.resize(nlayer); + _weights.attn_v_b.resize(nlayer); + _weights.attn_o_w.resize(nlayer); + _weights.mlp_norm_w.resize(nlayer); + _weights.mlp_gate_w.resize(nlayer); + _weights.mlp_up_w.resize(nlayer); + _weights.mlp_down_w.resize(nlayer); + + for (size_t i = 0; i < nlayer; i++) { + _weights.attn_norm_w[i] = Tensor::create({hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.attn_q_w[i] = Tensor::create({nh * dh, hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.attn_q_b[i] = Tensor::create({nh * dh}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.attn_k_w[i] = Tensor::create({nkvh * dh, hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.attn_k_b[i] = Tensor::create({nkvh * dh}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.attn_v_w[i] = Tensor::create({nkvh * dh, hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.attn_v_b[i] = Tensor::create({nkvh * dh}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.attn_o_w[i] = Tensor::create({hs, nh * dh}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.mlp_norm_w[i] = Tensor::create({hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.mlp_gate_w[i] = Tensor::create({di, hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.mlp_up_w[i] = Tensor::create({di, hs}, LLAISYS_DTYPE_BF16, device, device_id); + _weights.mlp_down_w[i] = Tensor::create({hs, di}, LLAISYS_DTYPE_BF16, device, device_id); + } + + // Allocate KV cache + _k_cache.resize(nlayer); + _v_cache.resize(nlayer); + for (size_t i = 0; i < nlayer; i++) { + _k_cache[i] = Tensor::create({maxseq, nkvh, dh}, LLAISYS_DTYPE_BF16, device, device_id); + _v_cache[i] = Tensor::create({maxseq, nkvh, dh}, LLAISYS_DTYPE_BF16, device, device_id); + } + + // Allocate argmax output tensors + _max_idx = Tensor::create({1}, LLAISYS_DTYPE_I64, device, device_id); + _max_val = Tensor::create({1}, LLAISYS_DTYPE_BF16, device, device_id); +} + +void Qwen2Model::allocateBuffers(size_t batch_size) { + if (batch_size <= _max_batch) { + return; + } + + size_t hs = _meta.hs; + size_t nh = _meta.nh; + size_t nkvh = _meta.nkvh; + size_t dh = _meta.dh; + size_t di = _meta.di; + size_t voc = _meta.voc; + + _hidden = Tensor::create({batch_size, hs}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _residual = Tensor::create({batch_size, hs}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _norm_out = Tensor::create({batch_size, hs}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _q = Tensor::create({batch_size, nh * dh}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _k = Tensor::create({batch_size, nkvh * dh}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _v = Tensor::create({batch_size, nkvh * dh}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _attn_out = Tensor::create({batch_size, nh, dh}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _gate = Tensor::create({batch_size, di}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _up = Tensor::create({batch_size, di}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _mlp_out = Tensor::create({batch_size, hs}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _logits = Tensor::create({1, voc}, LLAISYS_DTYPE_BF16, _device_type, _device_id); + _pos_ids = Tensor::create({batch_size}, LLAISYS_DTYPE_I64, _device_type, _device_id); + + _max_batch = batch_size; +} + +Qwen2Weights* Qwen2Model::weights() { + return &_weights; +} + +const Qwen2Meta* Qwen2Model::meta() const { + return &_meta; +} + +void Qwen2Model::resetCache() { + _cache_len = 0; +} + +int64_t Qwen2Model::infer(int64_t* token_ids, size_t ntoken) { + // Set context to the correct device + core::context().setDevice(_device_type, _device_id); + + allocateBuffers(ntoken); + + size_t nlayer = _meta.nlayer; + size_t hs = _meta.hs; + size_t nh = _meta.nh; + size_t nkvh = _meta.nkvh; + size_t dh = _meta.dh; + float eps = _meta.epsilon; + float theta = _meta.theta; + float scale = 1.0f / std::sqrt(static_cast(dh)); + + // Create input tensor for token_ids + auto input_ids = Tensor::create({ntoken}, LLAISYS_DTYPE_I64, _device_type, _device_id); + input_ids->load(token_ids); + + // Create position_ids: [cache_len, cache_len+1, ..., cache_len+ntoken-1] + std::vector pos_data(ntoken); + for (size_t i = 0; i < ntoken; i++) { + pos_data[i] = static_cast(_cache_len + i); + } + auto pos_ids = _pos_ids->slice(0, 0, ntoken); + pos_ids->load(pos_data.data()); + + // Get views for current batch size + auto hidden = _hidden->slice(0, 0, ntoken); + auto residual = _residual->slice(0, 0, ntoken); + auto norm_out = _norm_out->slice(0, 0, ntoken); + auto q = _q->slice(0, 0, ntoken); + auto k = _k->slice(0, 0, ntoken); + auto v = _v->slice(0, 0, ntoken); + auto attn_out = _attn_out->slice(0, 0, ntoken); + auto gate = _gate->slice(0, 0, ntoken); + auto up = _up->slice(0, 0, ntoken); + + // 1. Embedding lookup + ops::embedding(hidden, input_ids, _weights.in_embed); + + // 2. Process each layer + for (size_t layer = 0; layer < nlayer; layer++) { + // Save residual + ops::rearrange(residual, hidden); + + // Attention block + // RMSNorm + ops::rms_norm(norm_out, hidden, _weights.attn_norm_w[layer], eps); + + // Q/K/V projections + ops::linear(q, norm_out, _weights.attn_q_w[layer], _weights.attn_q_b[layer]); + ops::linear(k, norm_out, _weights.attn_k_w[layer], _weights.attn_k_b[layer]); + ops::linear(v, norm_out, _weights.attn_v_w[layer], _weights.attn_v_b[layer]); + + // Reshape Q/K/V for attention: [ntoken, nh*dh] -> [ntoken, nh, dh] + auto q_view = q->view({ntoken, nh, dh}); + auto k_view = k->view({ntoken, nkvh, dh}); + auto v_view = v->view({ntoken, nkvh, dh}); + + // Apply RoPE + ops::rope(q_view, q_view, pos_ids, theta); + ops::rope(k_view, k_view, pos_ids, theta); + + // Update KV cache + auto k_cache_slice = _k_cache[layer]->slice(0, _cache_len, _cache_len + ntoken); + auto v_cache_slice = _v_cache[layer]->slice(0, _cache_len, _cache_len + ntoken); + ops::rearrange(k_cache_slice, k_view); + ops::rearrange(v_cache_slice, v_view); + + // Get full KV cache for attention + auto k_full = _k_cache[layer]->slice(0, 0, _cache_len + ntoken); + auto v_full = _v_cache[layer]->slice(0, 0, _cache_len + ntoken); + + // Self attention + ops::self_attention(attn_out, q_view, k_full, v_full, scale); + + // Reshape attention output: [ntoken, nh, dh] -> [ntoken, hs] + auto attn_out_flat = attn_out->view({ntoken, hs}); + + // Output projection + ops::linear(hidden, attn_out_flat, _weights.attn_o_w[layer], nullptr); + + // Add residual + ops::add(hidden, hidden, residual); + + // MLP block + // Save residual + ops::rearrange(residual, hidden); + + // RMSNorm + ops::rms_norm(norm_out, hidden, _weights.mlp_norm_w[layer], eps); + + // Gate and Up projections + ops::linear(gate, norm_out, _weights.mlp_gate_w[layer], nullptr); + ops::linear(up, norm_out, _weights.mlp_up_w[layer], nullptr); + + // SwiGLU + ops::swiglu(gate, gate, up); + + // Down projection + ops::linear(hidden, gate, _weights.mlp_down_w[layer], nullptr); + + // Add residual + ops::add(hidden, hidden, residual); + } + + // 3. Final RMSNorm + ops::rms_norm(hidden, hidden, _weights.out_norm_w, eps); + + // 4. Get last token's hidden state and compute logits + auto last_hidden = hidden->slice(0, ntoken - 1, ntoken); // [1, hs] + ops::linear(_logits, last_hidden, _weights.out_embed, nullptr); + + // 5. Argmax + ops::argmax(_max_idx, _max_val, _logits); + + // 6. Update cache length + _cache_len += ntoken; + + // 7. Read and return the result + int64_t next_token; + // Copy from device to host + auto& runtime = core::context().runtime(); + runtime.api()->memcpy_sync(&next_token, _max_idx->data(), sizeof(int64_t), LLAISYS_MEMCPY_D2H); + + return next_token; +} + +} // namespace llaisys::models diff --git a/src/models/qwen2/qwen2.hpp b/src/models/qwen2/qwen2.hpp new file mode 100644 index 000000000..418b7d92e --- /dev/null +++ b/src/models/qwen2/qwen2.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include "../../tensor/tensor.hpp" +#include "llaisys.h" + +#include + +namespace llaisys::models { + +struct Qwen2Meta { + size_t nlayer; // num_hidden_layers + size_t hs; // hidden_size + size_t nh; // num_attention_heads + size_t nkvh; // num_key_value_heads + size_t dh; // head_dim = hs / nh + size_t di; // intermediate_size + size_t maxseq; // max_position_embeddings + size_t voc; // vocab_size + float epsilon; // rms_norm_eps + float theta; // rope_theta + int64_t end_token; // eos_token_id +}; + +struct Qwen2Weights { + tensor_t in_embed; // [voc, hs] + tensor_t out_embed; // [voc, hs] + tensor_t out_norm_w; // [hs] + + // Per-layer weights + std::vector attn_norm_w; // [nlayer][hs] + std::vector attn_q_w; // [nlayer][nh*dh, hs] + std::vector attn_q_b; // [nlayer][nh*dh] + std::vector attn_k_w; // [nlayer][nkvh*dh, hs] + std::vector attn_k_b; // [nlayer][nkvh*dh] + std::vector attn_v_w; // [nlayer][nkvh*dh, hs] + std::vector attn_v_b; // [nlayer][nkvh*dh] + std::vector attn_o_w; // [nlayer][hs, nh*dh] + + std::vector mlp_norm_w; // [nlayer][hs] + std::vector mlp_gate_w; // [nlayer][di, hs] + std::vector mlp_up_w; // [nlayer][di, hs] + std::vector mlp_down_w; // [nlayer][hs, di] +}; + +class Qwen2Model { +private: + Qwen2Meta _meta; + llaisysDeviceType_t _device_type; + int _device_id; + + Qwen2Weights _weights; + + // KV Cache: [nlayer] + std::vector _k_cache; // eaaxseq, nkvh, dh] + std::vector _v_cache; // each: [maxseq, nkvh, dh] + size_t _cache_len; + + // Intermediate buffers + tensor_t _hidden; // [seq, hs] + tensor_t _residual; // [seq, hs] + tensor_t _norm_out; // [seq, hs] + tensor_t _q; // [seq, nh*dh] + tensor_t _k; // [seq, nkvh*dh] + tensor_t _v; // [seq, nkvh*dh] + tensor_t _attn_out; // [seq, nh, dh] + tensor_t _gate; // [seq, di] + tensor_t _up; // [seq, di] + tensor_t _mlp_out; // [seq, hs] + tensor_t _logits; // [1, voc] + tensor_t _pos_ids; // [seq] + tensor_t _max_idx; // [1] + tensor_t _max_val; // [1] + + size_t _max_batch; // Maximum batch size for intermediate buffers + + void allocateBuffers(size_t batch_size); + +public: + Qwen2Model(const Qwen2Meta* meta, llaisysDeviceType_t device, int device_id); + ~Qwen2Model() = default; + + Qwen2Weights* weights(); + const Qwen2Meta* meta() const; + + int64_t infer(int64_t* token_ids, size_t ntoken); + void resetCache(); +}; + +} // namespace llaisys::models diff --git a/src/ops/add/metax/add_metax.cuh b/src/ops/add/metax/add_metax.cuh new file mode 100644 index 000000000..7d90dd208 --- /dev/null +++ b/src/ops/add/metax/add_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/add_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/add/metax/add_metax.maca b/src/ops/add/metax/add_metax.maca new file mode 100644 index 000000000..b925b2f68 --- /dev/null +++ b/src/ops/add/metax/add_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/add_nvidia.cu" diff --git a/src/ops/add/nvidia/add_nvidia.cu b/src/ops/add/nvidia/add_nvidia.cu new file mode 100644 index 000000000..c2ad43451 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cu @@ -0,0 +1,62 @@ +#include "add_nvidia.cuh" +#include "../../../utils.hpp" + +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) { + if constexpr (std::is_same_v || std::is_same_v) { + float fa = utils::cast_device(a[idx]); + float fb = utils::cast_device(b[idx]); + c[idx] = utils::cast_device(fa + fb); + } else { + c[idx] = a[idx] + b[idx]; + } + } +} + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel) { + const int block_size = 256; + const int num_blocks = (numel + block_size - 1) / block_size; + + switch (type) { + case LLAISYS_DTYPE_F32: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_F16: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_BF16: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA add: %d\n", type); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] add kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/add/nvidia/add_nvidia.cuh b/src/ops/add/nvidia/add_nvidia.cuh new file mode 100644 index 000000000..a2e9144ab --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cuh @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t size); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d7..b9898f0d1 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,11 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/add_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/add_metax.cuh" +#endif namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,8 +30,10 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 000000000..c632b7323 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,44 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void argmax_(int64_t *max_idx, T *max_val, const T *vals, size_t size) { + auto max = vals[0]; + size_t max_index = 0; + for (size_t i = 1; i < size; i++) { + if constexpr (std::is_same_v || std::is_same_v) { + if (llaisys::utils::cast(vals[i]) > llaisys::utils::cast(max)) { + max = vals[i]; + max_index = i; + } + } else { + if (vals[i] > max) { + max = vals[i]; + max_index = i; + } + } + } + max_val[0] = max; + max_idx[0] = static_cast(max_index); +} + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), + reinterpret_cast(vals), size); + case LLAISYS_DTYPE_BF16: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), + reinterpret_cast(vals), size); + case LLAISYS_DTYPE_F16: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), + reinterpret_cast(vals), size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 000000000..06234bacf --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { + void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size); +} \ No newline at end of file diff --git a/src/ops/argmax/metax/argmax_metax.cuh b/src/ops/argmax/metax/argmax_metax.cuh new file mode 100644 index 000000000..f2a87fd9f --- /dev/null +++ b/src/ops/argmax/metax/argmax_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/argmax_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/argmax/metax/argmax_metax.maca b/src/ops/argmax/metax/argmax_metax.maca new file mode 100644 index 000000000..53844c206 --- /dev/null +++ b/src/ops/argmax/metax/argmax_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/argmax_nvidia.cu" diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cu b/src/ops/argmax/nvidia/argmax_nvidia.cu new file mode 100644 index 000000000..89e11b01a --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cu @@ -0,0 +1,93 @@ +#include "argmax_nvidia.cuh" + +#include "../../../utils.hpp" + +#include + +#include +#include +#include + +namespace llaisys::ops::nvidia { + +struct argmax_pair_t { + float value; + int64_t index; +}; + +template +__global__ void argmax_kernel(int64_t *max_idx, T *max_val, const T *vals, size_t size) { + size_t tid = threadIdx.x; + float local_max = -FLT_MAX; + int64_t local_idx = 0; + + for (size_t i = tid; i < size; i += blockDim.x * gridDim.x) { + float value; + if constexpr (std::is_same_v || std::is_same_v) { + value = utils::cast_device(vals[i]); + } else { + value = vals[i]; + } + if (value > local_max) { + local_max = value; + local_idx = static_cast(i); + } + } + + extern __shared__ argmax_pair_t shared[]; + shared[tid] = {local_max, local_idx}; + __syncthreads(); + + for (size_t stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride && shared[tid + stride].value > shared[tid].value) { + shared[tid] = shared[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0) { + max_idx[0] = shared[0].index; + max_val[0] = utils::cast_device(shared[0].value); + } +} + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size) { + const int block_size = 256; + const int num_blocks = 1; + const size_t shared_mem = block_size * sizeof(argmax_pair_t); + + switch (type) { + case LLAISYS_DTYPE_F32: + argmax_kernel<<>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + size); + break; + case LLAISYS_DTYPE_F16: + argmax_kernel<<>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + size); + break; + case LLAISYS_DTYPE_BF16: + argmax_kernel<<>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + size); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA argmax: %d\n", type); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] argmax kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cuh b/src/ops/argmax/nvidia/argmax_nvidia.cuh new file mode 100644 index 000000000..f4ba25e17 --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cuh @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size); +} diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d426..9bc95dc33 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,43 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/argmax_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/argmax_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/argmax_metax.cuh" +#endif + namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + CHECK_SAME_DTYPE(max_val->dtype(), vals->dtype()); + ASSERT(vals->isContiguous(), "Argmax: vals tensor must be contiguous."); + ASSERT(max_idx->shape() == std::vector{1}, "Argmax: max_idx tensor must have shape (1,)."); + ASSERT(max_val->shape() == std::vector{1}, "Argmax: max_val tensor must have shape (1,)."); + ASSERT(max_idx->dtype() == LLAISYS_DTYPE_I64, "Argmax: max_idx tensor must have dtype int64."); + + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), + vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 000000000..236c049fe --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,35 @@ +#include "embedding_cpu.hpp" + +#include "../../../utils.hpp" +#include "llaisys.h" + +template +void embedding_(T *out, const int64_t *index, const T *weight, + size_t N, size_t D) { + for (size_t i = 0; i < N; ++i) { + size_t idx = index[i]; + for (size_t j = 0; j < D; ++j) { + out[i * D + j] = weight[idx * D + j]; + } + } +} + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + llaisysDataType_t weight_dtype, + size_t N, size_t D) { + switch (weight_dtype) { + case LLAISYS_DTYPE_F32: { + return embedding_(reinterpret_cast(out), reinterpret_cast(index), reinterpret_cast(weight), N, D); + } + case LLAISYS_DTYPE_BF16: { + return embedding_(reinterpret_cast(out), reinterpret_cast(index), reinterpret_cast(weight), N, D); + } + case LLAISYS_DTYPE_F16: { + return embedding_(reinterpret_cast(out), reinterpret_cast(index), reinterpret_cast(weight), N, D); + } + default: + EXCEPTION_UNSUPPORTED_DATATYPE(weight_dtype); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 000000000..375fbff59 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + llaisysDataType_t weight_dtype, + size_t N, size_t D); +} diff --git a/src/ops/embedding/metax/embedding_metax.cuh b/src/ops/embedding/metax/embedding_metax.cuh new file mode 100644 index 000000000..2637e5963 --- /dev/null +++ b/src/ops/embedding/metax/embedding_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/embedding_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/embedding/metax/embedding_metax.maca b/src/ops/embedding/metax/embedding_metax.maca new file mode 100644 index 000000000..24512800d --- /dev/null +++ b/src/ops/embedding/metax/embedding_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/embedding_nvidia.cu" diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cu b/src/ops/embedding/nvidia/embedding_nvidia.cu new file mode 100644 index 000000000..e5c8309cb --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cu @@ -0,0 +1,66 @@ +#include "embedding_nvidia.cuh" + +#include "../../../utils.hpp" + +#include + +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void embedding_kernel(T *out, const int64_t *index, const T *weight, size_t N, size_t D) { + size_t out_idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t numel = N * D; + if (out_idx < numel) { + size_t row = out_idx / D; + size_t col = out_idx % D; + size_t weight_row = static_cast(index[row]); + out[out_idx] = weight[weight_row * D + col]; + } +} + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + llaisysDataType_t weight_dtype, + size_t N, size_t D) { + const int block_size = 256; + const size_t numel = N * D; + const int num_blocks = static_cast((numel + block_size - 1) / block_size); + + switch (weight_dtype) { + case LLAISYS_DTYPE_F32: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + N, D); + break; + case LLAISYS_DTYPE_F16: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + N, D); + break; + case LLAISYS_DTYPE_BF16: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + N, D); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA embedding: %d\n", weight_dtype); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] embedding kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cuh b/src/ops/embedding/nvidia/embedding_nvidia.cuh new file mode 100644 index 000000000..bc04454e0 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cuh @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + llaisysDataType_t weight_dtype, + size_t N, size_t D); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d06..016d389cf 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,50 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/embedding_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/embedding_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/embedding_metax.cuh" +#endif + namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); -} + CHECK_SAME_DEVICE(out, index, weight); + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), + "Embedding: all tensors must be contiguous."); + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "Embedding: index tensor must be of type int64."); + ASSERT(index->ndim() == 1, "Embedding: index tensor must be 1D."); + ASSERT(weight->ndim() == 2, "Embedding: weight tensor must be 2D."); + ASSERT(out->ndim() == 2, "Embedding: out tensor must be 2D."); + ASSERT(out->shape()[0] == index->shape()[0], "Embedding: out.shape[0] must match index.shape[0]."); + ASSERT(out->shape()[1] == weight->shape()[1], "Embedding: out.shape[1] must match weight.shape[1]."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), weight->dtype(), + index->shape()[0], weight->shape()[1]); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), weight->dtype(), + index->shape()[0], weight->shape()[1]); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::embedding(out->data(), index->data(), weight->data(), weight->dtype(), + index->shape()[0], weight->shape()[1]); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::embedding(out->data(), index->data(), weight->data(), weight->dtype(), + index->shape()[0], weight->shape()[1]); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } // namespace llaisys::ops +} diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 000000000..4fe05cc46 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,52 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" +#include + +template +void linear_(T *out, const T *in, const T *weight, const T *bias, + size_t batch_size, size_t in_features, size_t out_features) { + using acc_t = std::conditional_t< + std::is_same_v || std::is_same_v, + float, T>; + + for (size_t i = 0; i < batch_size; ++i) { + for (size_t j = 0; j < out_features; ++j) { + acc_t sum = acc_t{}; + for (size_t k = 0; k < in_features; ++k) { + if constexpr (std::is_same_v) { + sum += in[i * in_features + k] * weight[j * in_features + k]; + } else { + sum += llaisys::utils::cast(in[i * in_features + k]) * llaisys::utils::cast(weight[j * in_features + k]); + } + } + if (bias != nullptr) { + if constexpr (std::is_same_v) { + sum += bias[j]; + } else { + sum += llaisys::utils::cast(bias[j]); + } + } + if constexpr (std::is_same_v) { + out[i * out_features + j] = sum; + } else { + out[i * out_features + j] = llaisys::utils::cast(sum); + } + } + } +} + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), batch_size, in_features, out_features); + case LLAISYS_DTYPE_BF16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), batch_size, in_features, out_features); + case LLAISYS_DTYPE_F16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), batch_size, in_features, out_features); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 000000000..3d01b1c50 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features); +} \ No newline at end of file diff --git a/src/ops/linear/metax/linear_metax.cuh b/src/ops/linear/metax/linear_metax.cuh new file mode 100644 index 000000000..58e520967 --- /dev/null +++ b/src/ops/linear/metax/linear_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/linear_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/linear/metax/linear_metax.maca b/src/ops/linear/metax/linear_metax.maca new file mode 100644 index 000000000..ac93aa27f --- /dev/null +++ b/src/ops/linear/metax/linear_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/linear_nvidia.cu" diff --git a/src/ops/linear/nvidia/linear_nvidia.cu b/src/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..5c4fc3a05 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,91 @@ +#include "linear_nvidia.cuh" + +#include "../../../utils.hpp" + +#include + +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void linear_kernel(T *out, const T *in, const T *weight, const T *bias, + size_t batch_size, size_t in_features, size_t out_features) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t numel = batch_size * out_features; + if (idx >= numel) { + return; + } + + size_t row = idx / out_features; + size_t col = idx % out_features; + + if constexpr (std::is_same_v || std::is_same_v) { + float sum = 0.0f; + for (size_t k = 0; k < in_features; ++k) { + sum += utils::cast_device(in[row * in_features + k]) * + utils::cast_device(weight[col * in_features + k]); + } + if (bias != nullptr) { + sum += utils::cast_device(bias[col]); + } + out[idx] = utils::cast_device(sum); + } else { + double sum = 0.0; + for (size_t k = 0; k < in_features; ++k) { + sum += static_cast(in[row * in_features + k]) * + static_cast(weight[col * in_features + k]); + } + if (bias != nullptr) { + sum += static_cast(bias[col]); + } + out[idx] = static_cast(sum); + } +} + +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features) { + const int block_size = 256; + const size_t numel = batch_size * out_features; + const int num_blocks = static_cast((numel + block_size - 1) / block_size); + + switch (type) { + case LLAISYS_DTYPE_F32: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, in_features, out_features); + break; + case LLAISYS_DTYPE_F16: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, in_features, out_features); + break; + case LLAISYS_DTYPE_BF16: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, in_features, out_features); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA linear: %d\n", type); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] linear kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/linear/nvidia/linear_nvidia.cuh b/src/ops/linear/nvidia/linear_nvidia.cuh new file mode 100644 index 000000000..4cf7f6cba --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cuh @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f8655..e5098b6d1 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,57 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/linear_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/linear_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/linear_metax.cuh" +#endif + namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + if (bias) CHECK_SAME_DEVICE(out, bias); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + CHECK_SAME_DTYPE(in->dtype(), weight->dtype()); + if (bias) CHECK_SAME_DTYPE(in->dtype(), bias->dtype()); + + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "Linear: output, input tensor and weight tensor must be contiguous."); + ASSERT(in->ndim() == 2, "Linear: input tensor must be 2D."); + ASSERT(weight->ndim() == 2, "Linear: weight tensor must be 2D."); + ASSERT(out->ndim() == 2, "Linear: output tensor must be 2D."); + ASSERT(in->shape()[1] == weight->shape()[1], "Linear: input and weight shapes are incompatible."); + ASSERT(out->shape()[0] == in->shape()[0], "Linear: out.shape[0] must match input batch size."); + ASSERT(out->shape()[1] == weight->shape()[0], "Linear: out.shape[1] must match weight.shape[0]."); + if (bias) { + ASSERT(bias->isContiguous(), "Linear: bias tensor must be contiguous."); + ASSERT(bias->ndim() == 1, "Linear: bias tensor must be 1D."); + ASSERT(bias->shape()[0] == weight->shape()[0], "Linear: bias shape must match weight.shape[0]."); + } + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear(out->data(), in->data(), weight->data(), bias ? bias->data() : nullptr, in->dtype(), in->shape()[0], in->shape()[1], weight->shape()[0]); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear(out->data(), in->data(), weight->data(), bias ? bias->data() : nullptr, in->dtype(), in->shape()[0], in->shape()[1], weight->shape()[0]); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::linear(out->data(), in->data(), weight->data(), bias ? bias->data() : nullptr, + in->dtype(), in->shape()[0], in->shape()[1], weight->shape()[0]); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::linear(out->data(), in->data(), weight->data(), bias ? bias->data() : nullptr, + in->dtype(), in->shape()[0], in->shape()[1], weight->shape()[0]); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cpp b/src/ops/rearrange/cpu/rearrange_cpu.cpp new file mode 100644 index 000000000..b2227f608 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.cpp @@ -0,0 +1,93 @@ +#include "rearrange_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +static void rearrange_recursive( + T *out, + const T *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides, + size_t dim) { + + if (dim == shape.size() - 1) { + // Last dimension - copy elements + for (size_t i = 0; i < shape[dim]; i++) { + out[i * out_strides[dim]] = in[i * in_strides[dim]]; + } + } else { + // Recurse into next dimension + for (size_t i = 0; i < shape[dim]; i++) { + rearrange_recursive( + out + i * out_strides[dim], + in + i * in_strides[dim], + shape, + out_strides, + in_strides, + dim + 1); + } + } +} + +template +static void rearrange_( + T *out, + const T *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides) { + + if (shape.empty()) { + // Scalar copy + *out = *in; + return; + } + + rearrange_recursive(out, in, shape, out_strides, in_strides, 0); +} + +namespace llaisys::ops::cpu { + +void rearrange( + std::byte *out, + const std::byte *in, + llaisysDataType_t dtype, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides) { + + switch (dtype) { + case LLAISYS_DTYPE_F32: + return rearrange_( + reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_BF16: + return rearrange_( + reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_F16: + return rearrange_( + reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_I64: + return rearrange_( + reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_I32: + return rearrange_( + reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} + +} // namespace llaisys::ops::cpu diff --git a/src/ops/rearrange/cpu/rearrange_cpu.hpp b/src/ops/rearrange/cpu/rearrange_cpu.hpp new file mode 100644 index 000000000..ee570ec9e --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.hpp @@ -0,0 +1,15 @@ +#pragma once +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void rearrange( + std::byte *out, + const std::byte *in, + llaisysDataType_t dtype, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides); +} // namespace llaisys::ops::cpu diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 017a6ae59..fd6c51186 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -1,7 +1,42 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rearrange_cpu.hpp" + namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + + // Always support CPU calculation + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rearrange( + out->data(), + in->data(), + out->dtype(), + out->shape(), + out->strides(), + in->strides()); + } + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_METAX_API) + if (out->deviceType() == LLAISYS_DEVICE_NVIDIA || out->deviceType() == LLAISYS_DEVICE_METAX) { + ASSERT(out->isContiguous() && in->isContiguous(), + "Rearrange CUDA: only contiguous tensors are supported for now."); + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + auto &runtime = llaisys::core::context().runtime(); + runtime.api()->memcpy_sync( + out->data(), + in->data(), + out->numel() * out->elementSize(), + LLAISYS_MEMCPY_D2D); + return; + } +#endif + + EXCEPTION_UNSUPPORTED_DEVICE; } } // namespace llaisys::ops diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 000000000..a494f1d2c --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,47 @@ +#include "rms_norm_cpu.hpp" +#include "../../../utils.hpp" +#include +#include +template +void rms_norm_(T *out, const T *in, const T *weight,float eps, size_t N, size_t D) { + using acc_t = std::conditional_t || std::is_same_v, float, T>; + + for (size_t i = 0; i < N; ++i) { + acc_t square_sum = 0; + for (size_t j = 0; j < D; ++j) { + if constexpr(std::is_same_v) { + square_sum += in[i*D+j]*in[i*D+j]; + } else { + auto x = llaisys::utils::cast(in[i*D + j]); + square_sum += x*x; + } + } + auto RMS = std::sqrt(square_sum/D+eps); + for (size_t j = 0; j < D; ++j) { + if constexpr (std::is_same_v) { + out[i * D + j] = (in[i * D + j] / RMS) * weight[j]; + } else { + auto x = llaisys::utils::cast(in[i * D + j]); + auto w = llaisys::utils::cast(weight[j]); + out[i * D + j] = llaisys::utils::cast((x / RMS) * w); + } + } + } +} + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight,float eps, llaisysDataType_t dtype, size_t N, size_t D) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight),eps, N, D); + case LLAISYS_DTYPE_BF16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight),eps, N, D); + case LLAISYS_DTYPE_F16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight),eps, N, D); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 000000000..756fbadb8 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { + void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight,float eps, llaisysDataType_t dtype, size_t N, size_t D); +} \ No newline at end of file diff --git a/src/ops/rms_norm/metax/rms_norm_metax.cuh b/src/ops/rms_norm/metax/rms_norm_metax.cuh new file mode 100644 index 000000000..ab551ec3c --- /dev/null +++ b/src/ops/rms_norm/metax/rms_norm_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/rms_norm_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/rms_norm/metax/rms_norm_metax.maca b/src/ops/rms_norm/metax/rms_norm_metax.maca new file mode 100644 index 000000000..c17ccabfe --- /dev/null +++ b/src/ops/rms_norm/metax/rms_norm_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/rms_norm_nvidia.cu" diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu new file mode 100644 index 000000000..89971f807 --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -0,0 +1,89 @@ +#include "rms_norm_nvidia.cuh" + +#include "../../../utils.hpp" + +#include + +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void rms_norm_kernel(T *out, const T *in, const T *weight, float eps, size_t D) { + size_t row = blockIdx.x; + size_t tid = threadIdx.x; + + extern __shared__ float shared[]; + float local_sum = 0.0f; + + for (size_t col = tid; col < D; col += blockDim.x) { + float value = utils::cast_device(in[row * D + col]); + local_sum += value * value; + } + + shared[tid] = local_sum; + __syncthreads(); + + for (size_t stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + shared[tid] += shared[tid + stride]; + } + __syncthreads(); + } + + float inv_rms = rsqrtf(shared[0] / static_cast(D) + eps); + for (size_t col = tid; col < D; col += blockDim.x) { + if constexpr (std::is_same_v || std::is_same_v) { + float x = utils::cast_device(in[row * D + col]); + float w = utils::cast_device(weight[col]); + out[row * D + col] = utils::cast_device(x * inv_rms * w); + } else { + out[row * D + col] = (in[row * D + col] * inv_rms) * weight[col]; + } + } +} + +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, + llaisysDataType_t dtype, size_t N, size_t D) { + const int block_size = 256; + const int num_blocks = static_cast(N); + const size_t shared_mem = block_size * sizeof(float); + + switch (dtype) { + case LLAISYS_DTYPE_F32: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, D); + break; + case LLAISYS_DTYPE_F16: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, D); + break; + case LLAISYS_DTYPE_BF16: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, D); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA rms_norm: %d\n", dtype); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] rms_norm kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh new file mode 100644 index 000000000..ca3b4c0ee --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, + llaisysDataType_t dtype, size_t N, size_t D); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9d..5a569f6f2 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,49 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rms_norm_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/rms_norm_metax.cuh" +#endif + namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "RMS Norm: inputs must be contiguous."); + ASSERT(in->ndim() == 2, "RMS Norm: input tensor must be 2D."); + ASSERT(out->ndim() == 2, "RMS Norm: output tensor must be 2D."); + ASSERT(weight->ndim() == 1, "RMS Norm: weight tensor must be 1D."); + + size_t N = in->shape()[0]; + size_t D = in->shape()[1]; + ASSERT(out->shape() == in->shape(), "RMS Norm: output shape must match input shape."); + ASSERT(weight->shape()[0] == D, "RMS Norm: weight shape must match input hidden size."); + + // always support cpu calculation + if (in->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), weight->data(), eps, in->dtype(), N, D); + } + + llaisys::core::context().setDevice(in->deviceType(), in->deviceId()); + + switch (in->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), weight->data(), eps, in->dtype(), N, D); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rms_norm(out->data(), in->data(), weight->data(), eps, in->dtype(), N, D); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::rms_norm(out->data(), in->data(), weight->data(), eps, in->dtype(), N, D); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 000000000..afe6628ec --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,47 @@ +#include "../../../utils.hpp" +#include +#include +#include +#include "rope_cpu.hpp" +template +void rope_(T *out, const T *in, const int64_t *pos_ids, float theta, size_t seq_len, size_t n_heads, size_t head_dim) { + size_t half_dim = head_dim / 2; + for (size_t i = 0; i < seq_len; ++i) { + auto pos = pos_ids[i]; + for (size_t j = 0; j < n_heads; ++j) { + size_t offset = i * head_dim * n_heads + j * head_dim; + for (size_t k = 0; k < half_dim; ++k) { + float angle = pos * 1.0f / std::pow(theta, 2.0f*k/head_dim); + float sin_angle = std::sin(angle); + float cos_angle = std::cos(angle); + + auto a_i_j = in[offset + k]; + auto b_i_j = in[offset + k + half_dim]; + if constexpr (std::is_same_v) { + out[offset + k] = a_i_j * cos_angle - b_i_j * sin_angle; + out[offset + k + half_dim] = b_i_j * cos_angle + a_i_j * sin_angle; + } else { + auto a = llaisys::utils::cast(a_i_j); + auto b = llaisys::utils::cast(b_i_j); + out[offset + k] = llaisys::utils::cast(a * cos_angle - b * sin_angle); + out[offset + k + half_dim] = llaisys::utils::cast(b * cos_angle + a * sin_angle); + } + } + } + } +} + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t dtype, size_t seq_len, size_t n_heads, size_t head_dim) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(pos_ids), theta, seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(pos_ids), theta, seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(pos_ids), theta, seq_len, n_heads, head_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 000000000..cc7f319cc --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + float theta, llaisysDataType_t dtype, + size_t seq_len, size_t n_heads, size_t head_dim); +} \ No newline at end of file diff --git a/src/ops/rope/metax/rope_metax.cuh b/src/ops/rope/metax/rope_metax.cuh new file mode 100644 index 000000000..525debee9 --- /dev/null +++ b/src/ops/rope/metax/rope_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/rope_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/rope/metax/rope_metax.maca b/src/ops/rope/metax/rope_metax.maca new file mode 100644 index 000000000..f68c2b01c --- /dev/null +++ b/src/ops/rope/metax/rope_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/rope_nvidia.cu" diff --git a/src/ops/rope/nvidia/rope_nvidia.cu b/src/ops/rope/nvidia/rope_nvidia.cu new file mode 100644 index 000000000..505027a30 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cu @@ -0,0 +1,88 @@ +#include "rope_nvidia.cuh" + +#include "../../../utils.hpp" + +#include + +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void rope_kernel(T *out, const T *in, const int64_t *pos_ids, float theta, + size_t seq_len, size_t n_heads, size_t head_dim, size_t half_dim) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t numel = seq_len * n_heads * half_dim; + if (idx >= numel) { + return; + } + + size_t k = idx % half_dim; + size_t tmp = idx / half_dim; + size_t head = tmp % n_heads; + size_t seq = tmp / n_heads; + + size_t offset = seq * n_heads * head_dim + head * head_dim; + float angle = static_cast(pos_ids[seq]) / powf(theta, 2.0f * static_cast(k) / static_cast(head_dim)); + float sin_angle = sinf(angle); + float cos_angle = cosf(angle); + + if constexpr (std::is_same_v || std::is_same_v) { + float a = utils::cast_device(in[offset + k]); + float b = utils::cast_device(in[offset + k + half_dim]); + out[offset + k] = utils::cast_device(a * cos_angle - b * sin_angle); + out[offset + k + half_dim] = utils::cast_device(b * cos_angle + a * sin_angle); + } else { + T a = in[offset + k]; + T b = in[offset + k + half_dim]; + out[offset + k] = a * cos_angle - b * sin_angle; + out[offset + k + half_dim] = b * cos_angle + a * sin_angle; + } +} + +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + float theta, llaisysDataType_t dtype, + size_t seq_len, size_t n_heads, size_t head_dim) { + const size_t half_dim = head_dim / 2; + const size_t numel = seq_len * n_heads * half_dim; + const int block_size = 256; + const int num_blocks = static_cast((numel + block_size - 1) / block_size); + + switch (dtype) { + case LLAISYS_DTYPE_F32: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim, half_dim); + break; + case LLAISYS_DTYPE_F16: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim, half_dim); + break; + case LLAISYS_DTYPE_BF16: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim, half_dim); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA rope: %d\n", dtype); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] rope kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rope/nvidia/rope_nvidia.cuh b/src/ops/rope/nvidia/rope_nvidia.cuh new file mode 100644 index 000000000..2738bfd90 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cuh @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + float theta, llaisysDataType_t dtype, + size_t seq_len, size_t n_heads, size_t head_dim); +} diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64e..5fee1b78d 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,54 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rope_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rope_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/rope_metax.cuh" +#endif + namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(), "RoPE: inputs must be contiguous."); + ASSERT(in->ndim() == 3, "RoPE: input tensor must be 3D."); + ASSERT(out->ndim() == 3, "RoPE: output tensor must be 3D."); + ASSERT(pos_ids->ndim() == 1, "RoPE: pos_ids tensor must be 1D."); + ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids tensor must be int64."); + ASSERT(out->shape() == in->shape(), "RoPE: output shape must match input shape."); + + size_t seq_len = in->shape()[0]; + size_t n_heads = in->shape()[1]; + size_t head_dim = in->shape()[2]; + ASSERT(pos_ids->shape()[0] == seq_len, "RoPE: pos_ids length must match sequence length."); + ASSERT(head_dim % 2 == 0, "RoPE: head dimension must be even."); + + if (in->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), pos_ids->data(), + theta, in->dtype(), seq_len, n_heads, head_dim); + } + + llaisys::core::context().setDevice(in->deviceType(), in->deviceId()); + + switch (in->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), pos_ids->data(), + theta, in->dtype(), seq_len, n_heads, head_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rope(out->data(), in->data(), pos_ids->data(), + theta, in->dtype(), seq_len, n_heads, head_dim); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::rope(out->data(), in->data(), pos_ids->data(), + theta, in->dtype(), seq_len, n_heads, head_dim); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 000000000..543fd8125 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,94 @@ +#include "self_attention_cpu.hpp" +#include "../../../utils.hpp" +#include +#include +#include +#include +#include +#include + +template +void self_attention_(T *attn_val, const T *q, const T *k, const T *v, + float scale, size_t seq_len, size_t total_len, + size_t nhead, size_t nkvhead, size_t d, size_t dv) { + using acc_t = std::conditional_t< + std::is_same_v || std::is_same_v, + float, T>; + + size_t n_rep = nhead / nkvhead; + std::vector scores(total_len); + + for (size_t i = 0; i < seq_len; ++i) { + size_t global_pos = total_len - seq_len + i; + size_t valid_len = std::min(global_pos + 1, total_len); + + for (size_t h = 0; h < nhead; ++h) { + size_t kv_h = h / n_rep; + + const T *q_ptr = q + (i * nhead * d) + (h * d); + T *out_ptr = attn_val + (i * nhead * dv) + (h * dv); + + acc_t max_val = -std::numeric_limits::infinity(); + + // 1. Calculate Scores (Only valid positions) + for (size_t t = 0; t < valid_len; ++t) { + const T *k_ptr = k + (t * nkvhead * d) + (kv_h * d); + acc_t dot = 0; + for (size_t j = 0; j < d; ++j) { + dot += llaisys::utils::cast(q_ptr[j]) * + llaisys::utils::cast(k_ptr[j]); + } + scores[t] = dot * scale; + if (scores[t] > max_val) { + max_val = scores[t]; + } + } + + // 2. Softmax (Only valid positions) + acc_t sum_exp = 0; + for (size_t t = 0; t < valid_len; ++t) { + scores[t] = std::exp(scores[t] - max_val); + sum_exp += scores[t]; + } + + // 3. Weighted Sum (Only valid positions) + for (size_t j = 0; j < dv; ++j) { + acc_t val = 0; + for (size_t t = 0; t < valid_len; ++t) { + const T *v_ptr = v + (t * nkvhead * dv) + (kv_h * dv); + val += scores[t] * llaisys::utils::cast(v_ptr[j]); + } + out_ptr[j] = llaisys::utils::cast(val / sum_exp); + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t dtype, + size_t seq_len, size_t total_len, size_t nhead, size_t nkvhead, size_t d, size_t dv) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, seq_len, total_len, nhead, nkvhead, d, dv); + case LLAISYS_DTYPE_BF16: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, seq_len, total_len, nhead, nkvhead, d, dv); + case LLAISYS_DTYPE_F16: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, seq_len, total_len, nhead, nkvhead, d, dv); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 000000000..4bf193bbf --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t dtype, + size_t seq_len, size_t total_len, size_t nhead, size_t nkvhead, size_t d, size_t dv); +} diff --git a/src/ops/self_attention/metax/self_attention_metax.cuh b/src/ops/self_attention/metax/self_attention_metax.cuh new file mode 100644 index 000000000..d43b3412f --- /dev/null +++ b/src/ops/self_attention/metax/self_attention_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/self_attention_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/self_attention/metax/self_attention_metax.maca b/src/ops/self_attention/metax/self_attention_metax.maca new file mode 100644 index 000000000..aab67a344 --- /dev/null +++ b/src/ops/self_attention/metax/self_attention_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/self_attention_nvidia.cu" diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu new file mode 100644 index 000000000..e7a8005f3 --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -0,0 +1,123 @@ +#include "self_attention_nvidia.cuh" + +#include "../../../utils.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void self_attention_kernel_impl(T *attn_val, const T *q, const T *k, const T *v, + float scale, size_t seq_len, size_t total_len, + size_t nhead, size_t nkvhead, size_t d, size_t dv) { + if (threadIdx.x != 0) {return;} + + extern __shared__ double scores[]; + + size_t query_idx = blockIdx.x; + size_t head_idx = blockIdx.y; + size_t n_rep = nhead / nkvhead; + size_t kv_head_idx = head_idx / n_rep; + + size_t global_pos = total_len - seq_len + query_idx; + size_t valid_len = global_pos + 1; + if (valid_len > total_len) { + valid_len = total_len; + } + + const T *q_ptr = q + (query_idx * nhead * d) + (head_idx * d); + T *out_ptr = attn_val + (query_idx * nhead * dv) + (head_idx * dv); + + double max_val = -DBL_MAX; + for (size_t t = 0; t < valid_len; ++t) { + const T *k_ptr = k + (t * nkvhead * d) + (kv_head_idx * d); + double dot = 0.0; + for (size_t j = 0; j < d; ++j) { + if constexpr (std::is_same_v || std::is_same_v) { + dot += static_cast(utils::cast_device(q_ptr[j])) * + static_cast(utils::cast_device(k_ptr[j])); + } else { + dot += static_cast(q_ptr[j]) * static_cast(k_ptr[j]); + } + } + scores[t] = dot * static_cast(scale); + if (scores[t] > max_val) { + max_val = scores[t]; + } + } + + double sum_exp = 0.0; + for (size_t t = 0; t < valid_len; ++t) { + scores[t] = exp(scores[t] - max_val); + sum_exp += scores[t]; + } + + for (size_t j = 0; j < dv; ++j) { + double value = 0.0; + for (size_t t = 0; t < valid_len; ++t) { + const T *v_ptr = v + (t * nkvhead * dv) + (kv_head_idx * dv); + if constexpr (std::is_same_v || std::is_same_v) { + value += scores[t] * static_cast(utils::cast_device(v_ptr[j])); + } else { + value += scores[t] * static_cast(v_ptr[j]); + } + } + if constexpr (std::is_same_v || std::is_same_v) { + out_ptr[j] = utils::cast_device(static_cast(value / sum_exp)); + } else { + out_ptr[j] = static_cast(value / sum_exp); + } + } +} + +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t dtype, + size_t seq_len, size_t total_len, size_t nhead, size_t nkvhead, size_t d, size_t dv) { + const dim3 num_blocks(static_cast(seq_len), static_cast(nhead), 1); + const int block_size = 1; + const size_t shared_mem = total_len * sizeof(double); + + switch (dtype) { + case LLAISYS_DTYPE_F32: + self_attention_kernel_impl<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, seq_len, total_len, nhead, nkvhead, d, dv); + break; + case LLAISYS_DTYPE_F16: + self_attention_kernel_impl<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, seq_len, total_len, nhead, nkvhead, d, dv); + break; + case LLAISYS_DTYPE_BF16: + self_attention_kernel_impl<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, seq_len, total_len, nhead, nkvhead, d, dv); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA self_attention: %d\n", dtype); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] self_attention kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cuh b/src/ops/self_attention/nvidia/self_attention_nvidia.cuh new file mode 100644 index 000000000..55a5f922f --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cuh @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t dtype, + size_t seq_len, size_t total_len, size_t nhead, size_t nkvhead, size_t d, size_t dv); +} diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d620142..15fcc6272 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,60 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/self_attention_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/self_attention_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/self_attention_metax.cuh" +#endif + namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype()); + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "Self attention: all tensors must be contiguous."); + ASSERT(attn_val->ndim() == 3 && q->ndim() == 3 && k->ndim() == 3 && v->ndim() == 3, + "Self attention: all tensors must be 3D."); + + size_t seq_len = q->shape()[0]; + size_t nhead = q->shape()[1]; + size_t d = q->shape()[2]; + size_t total_len = k->shape()[0]; + size_t nkvhead = k->shape()[1]; + size_t dv = v->shape()[2]; + ASSERT(attn_val->shape()[0] == seq_len, "Self attention: output seq_len must match q."); + ASSERT(attn_val->shape()[1] == nhead, "Self attention: output nhead must match q."); + ASSERT(k->shape()[0] == v->shape()[0], "Self attention: k and v total_len must match."); + ASSERT(k->shape()[1] == v->shape()[1], "Self attention: k and v nhead must match."); + ASSERT(d == k->shape()[2], "Self attention: q and k head_dim must match."); + ASSERT(dv == attn_val->shape()[2], "Self attention: output value dim must match v."); + ASSERT(nkvhead > 0 && nhead % nkvhead == 0, "Self attention: nhead must be divisible by nkvhead."); + + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), seq_len, total_len, nhead, nkvhead, d, dv); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), seq_len, total_len, nhead, nkvhead, d, dv); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), seq_len, total_len, nhead, nkvhead, d, dv); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), seq_len, total_len, nhead, nkvhead, d, dv); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 000000000..9cd15d278 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,43 @@ +#include "swiglu_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void swiglu_(T *out, const T *gate, const T *up, size_t numel) { + for (size_t i = 0; i < numel; i++) { + if constexpr (std::is_same_v || std::is_same_v) { + float gate_val = llaisys::utils::cast(gate[i]); + float up_val = llaisys::utils::cast(up[i]); + // silu(gate) = gate / (1 + exp(-gate)) + float silu_gate = gate_val / (1.0f + std::exp(-gate_val)); + float result = up_val * silu_gate; + out[i] = llaisys::utils::cast(result); + } else { + T gate_val = gate[i]; + T up_val = up[i]; + // silu(gate) = gate / (1 + exp(-gate)) + T silu_gate = gate_val / (static_cast(1) + std::exp(-gate_val)); + out[i] = up_val * silu_gate; + } + } +} + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), numel); + case LLAISYS_DTYPE_BF16: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), numel); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 000000000..9bc2fd2d9 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/swiglu/metax/swiglu_metax.cuh b/src/ops/swiglu/metax/swiglu_metax.cuh new file mode 100644 index 000000000..fbf35ec69 --- /dev/null +++ b/src/ops/swiglu/metax/swiglu_metax.cuh @@ -0,0 +1,6 @@ +#pragma once +#include "../nvidia/swiglu_nvidia.cuh" + +namespace llaisys::ops { +namespace metax = nvidia; +} diff --git a/src/ops/swiglu/metax/swiglu_metax.maca b/src/ops/swiglu/metax/swiglu_metax.maca new file mode 100644 index 000000000..07378ac4d --- /dev/null +++ b/src/ops/swiglu/metax/swiglu_metax.maca @@ -0,0 +1 @@ +#include "../nvidia/swiglu_nvidia.cu" diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/ops/swiglu/nvidia/swiglu_nvidia.cu new file mode 100644 index 000000000..eb68b179b --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -0,0 +1,66 @@ +#include "swiglu_nvidia.cuh" +#include "../../../utils.hpp" + +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void swiglu_kernel(T *out, const T *gate, const T *up, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) { + if constexpr (std::is_same_v || std::is_same_v) { + float g = utils::cast_device(gate[idx]); + float u = utils::cast_device(up[idx]); + float silu_gate = g / (1.0f + expf(-g)); + out[idx] = utils::cast_device(u * silu_gate); + } else { + T g = gate[idx]; + T u = up[idx]; + T silu_gate = g / (static_cast(1) + expf(-g)); + out[idx] = u * silu_gate; + } + } +} + +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel) { + const int block_size = 256; + const int num_blocks = (numel + block_size - 1) / block_size; + + switch (type) { + case LLAISYS_DTYPE_F32: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + case LLAISYS_DTYPE_F16: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + case LLAISYS_DTYPE_BF16: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + default: + std::fprintf(stderr, "[ERROR] Unsupported data type for CUDA swiglu: %d\n", type); + throw std::runtime_error("Unsupported data type"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::fprintf(stderr, "[CUDA ERROR] swiglu kernel launch failed: %s\n", cudaGetErrorString(err)); + throw std::runtime_error(cudaGetErrorString(err)); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cuh b/src/ops/swiglu/nvidia/swiglu_nvidia.cuh new file mode 100644 index 000000000..c94306d5c --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cuh @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc97..b28ec6e69 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,40 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/swiglu_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/swiglu_nvidia.cuh" +#elif defined(ENABLE_METAX_API) +#include "metax/swiglu_metax.cuh" +#endif + namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_SHAPE(out->shape(), gate->shape(), up->shape()); + CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype()); + ASSERT(out->isContiguous() && gate->isContiguous() && up->isContiguous(), + "SwiGLU: all tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); +#elif defined(ENABLE_METAX_API) + case LLAISYS_DEVICE_METAX: + return metax::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb65..708e62642 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -2,9 +2,11 @@ #include "../utils.hpp" +#include #include #include #include +#include namespace llaisys { @@ -164,27 +166,61 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + size_t tensor_dim = this->ndim(); + size_t tmp_dim = 1; + for (size_t i = tensor_dim; i-- > 0;) { + if (static_cast(tmp_dim) != this->strides()[i]) { + return false; + } + tmp_dim *= this->shape()[i]; + } + return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + std::vector new_shape (ndim()); + std::vector new_strides(ndim()); + for (size_t i = 0; i < order.size(); ++i) { + new_shape[i] = this->shape()[order[i]]; + new_strides[i] = this->strides()[order[i]]; + } + auto new_meta = TensorMeta{dtype(), new_shape, new_strides}; + return std::shared_ptr(new Tensor(new_meta, _storage)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + size_t new_dim = shape.size(); + std::vector new_strides(new_dim); + size_t stride = 1; + for (size_t i = new_dim; i-- >0;) { + new_strides[i] = stride; + stride *= shape[i]; + } + + if (this->numel() != stride || !isContiguous()) { + EXCEPTION_TRANSFORM_SHAPE; + } + + auto new_meta = TensorMeta{dtype(), shape, new_strides}; + return std::shared_ptr(new Tensor(new_meta, this->_storage, this->_offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + auto new_shape(this->shape()); + new_shape[dim] = end - start; + auto new_offset = this->_offset + start * this->strides()[dim] * elementSize(); + auto new_meta = TensorMeta{dtype(), new_shape, this->strides()}; + return std::shared_ptr(new Tensor(new_meta, this->_storage,new_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + core::context().setDevice(this->deviceType(), this->deviceId()); + core::context().runtime().api()->memcpy_sync( + this->data(), + src_, + this->numel() * this->elementSize(), + LLAISYS_MEMCPY_H2D); } tensor_t Tensor::contiguous() const { diff --git a/src/utils/check.hpp b/src/utils/check.hpp index 82de2a7ea..3e919f950 100644 --- a/src/utils/check.hpp +++ b/src/utils/check.hpp @@ -86,3 +86,9 @@ } \ } \ } while (0) + +#define EXCEPTION_TRANSFORM_SHAPE \ + do { \ + std::cerr << "[ERROR] Failed to transform shape" << EXCEPTION_LOCATION_MSG << std::endl; \ + throw std::invalid_argument("Numel mismatch or not contiguous"); \ + } while (0) diff --git a/src/utils/types.hpp b/src/utils/types.hpp index e09619db8..8d51aea93 100644 --- a/src/utils/types.hpp +++ b/src/utils/types.hpp @@ -2,6 +2,14 @@ #include #include +#include + +#if defined(__CUDACC__) || defined(__CUDA_ARCH__) +#define LLAISYS_HOST_DEVICE __host__ __device__ +#include +#else +#define LLAISYS_HOST_DEVICE +#endif namespace llaisys { struct CustomFloat16 { @@ -107,36 +115,89 @@ inline const char *dtype_to_str(llaisysDataType_t dtype) { } } +// Host-only functions (implemented in types.cpp) float _f16_to_f32(fp16_t val); fp16_t _f32_to_f16(float val); - float _bf16_to_f32(bf16_t val); bf16_t _f32_to_bf16(float val); +// Device-compatible conversion functions +LLAISYS_HOST_DEVICE inline float _f16_to_f32_device(fp16_t val) { +#if defined(__CUDA_ARCH__) || defined(__CUDACC__) + // Use CUDA half arithmetic on device + half h = __ushort_as_half(val._v); + return __half2float(h); +#else + // Fall back to host implementation + return _f16_to_f32(val); +#endif +} + +LLAISYS_HOST_DEVICE inline fp16_t _f32_to_f16_device(float val) { +#if defined(__CUDA_ARCH__) || defined(__CUDACC__) + // Use CUDA half arithmetic on device + half h = __float2half(val); + return fp16_t{__half_as_ushort(h)}; +#else + // Fall back to host implementation + return _f32_to_f16(val); +#endif +} + +LLAISYS_HOST_DEVICE inline float _bf16_to_f32_device(bf16_t val) { +#if defined(__CUDA_ARCH__) || defined(__CUDACC__) + // bf16 is stored as upper 16 bits of float32 + uint32_t bits = static_cast(val._v) << 16; + return *reinterpret_cast(&bits); +#else + // Fall back to host implementation + return _bf16_to_f32(val); +#endif +} + +LLAISYS_HOST_DEVICE inline bf16_t _f32_to_bf16_device(float val) { +#if defined(__CUDA_ARCH__) || defined(__CUDACC__) + // Convert float to bf16 by taking upper 16 bits (with rounding) + uint32_t bits = *reinterpret_cast(&val); + uint32_t rounding_bias = 0x00007FFF + ((bits >> 16) & 1); + return bf16_t{static_cast((bits + rounding_bias) >> 16)}; +#else + // Fall back to host implementation + return _f32_to_bf16(val); +#endif +} + +// Device-compatible cast function template -TypeTo cast(TypeFrom val) { +LLAISYS_HOST_DEVICE TypeTo cast_device(TypeFrom val) { if constexpr (std::is_same::value) { return val; } else if constexpr (std::is_same::value && std::is_same::value) { - return _f32_to_f16(val); + return _f32_to_f16_device(val); } else if constexpr (std::is_same::value && !std::is_same::value) { - return _f32_to_f16(static_cast(val)); + return _f32_to_f16_device(static_cast(val)); } else if constexpr (std::is_same::value && std::is_same::value) { - return _f16_to_f32(val); + return _f16_to_f32_device(val); } else if constexpr (std::is_same::value && !std::is_same::value) { - return static_cast(_f16_to_f32(val)); + return static_cast(_f16_to_f32_device(val)); } else if constexpr (std::is_same::value && std::is_same::value) { - return _f32_to_bf16(val); + return _f32_to_bf16_device(val); } else if constexpr (std::is_same::value && !std::is_same::value) { - return _f32_to_bf16(static_cast(val)); + return _f32_to_bf16_device(static_cast(val)); } else if constexpr (std::is_same::value && std::is_same::value) { - return _bf16_to_f32(val); + return _bf16_to_f32_device(val); } else if constexpr (std::is_same::value && !std::is_same::value) { - return static_cast(_bf16_to_f32(val)); + return static_cast(_bf16_to_f32_device(val)); } else { return static_cast(val); } } +// Original host-only cast function (for backward compatibility) +template +TypeTo cast(TypeFrom val) { + return cast_device(val); +} + } // namespace utils } // namespace llaisys diff --git a/test/ops/add.py b/test/ops/add.py index bb8bf8ca8..f7b2989e4 100644 --- a/test/ops/add.py +++ b/test/ops/add.py @@ -42,7 +42,7 @@ def test_op_add( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [(2, 3), (512, 4096)] diff --git a/test/ops/argmax.py b/test/ops/argmax.py index d0f7ee298..87a5d970d 100644 --- a/test/ops/argmax.py +++ b/test/ops/argmax.py @@ -43,7 +43,7 @@ def test_op_argmax( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [(4,), (4096,)] diff --git a/test/ops/embedding.py b/test/ops/embedding.py index 99cadc1b8..17286babf 100644 --- a/test/ops/embedding.py +++ b/test/ops/embedding.py @@ -39,7 +39,7 @@ def test_op_embedding( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [ diff --git a/test/ops/linear.py b/test/ops/linear.py index 38897331f..24a38190b 100644 --- a/test/ops/linear.py +++ b/test/ops/linear.py @@ -49,7 +49,7 @@ def test_op_linear( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [ diff --git a/test/ops/rms_norm.py b/test/ops/rms_norm.py index 67b789e3f..b4b62d27b 100644 --- a/test/ops/rms_norm.py +++ b/test/ops/rms_norm.py @@ -48,7 +48,7 @@ def test_op_rms_norm( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [(1, 4), (512, 4096)] diff --git a/test/ops/rope.py b/test/ops/rope.py index fe59dd11c..bfb620b24 100644 --- a/test/ops/rope.py +++ b/test/ops/rope.py @@ -63,7 +63,7 @@ def test_op_rope( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [ diff --git a/test/ops/self_attention.py b/test/ops/self_attention.py index a042b51be..8b478952c 100644 --- a/test/ops/self_attention.py +++ b/test/ops/self_attention.py @@ -15,7 +15,7 @@ def torch_self_attention(attn_val, query, key, value, scale): L, S = query.size(-2), key.size(-2) attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device) - temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=S-L) + temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=S-L) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) @@ -65,7 +65,7 @@ def test_op_self_attention( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [ diff --git a/test/ops/swiglu.py b/test/ops/swiglu.py index 1fa08f739..1a1880565 100644 --- a/test/ops/swiglu.py +++ b/test/ops/swiglu.py @@ -42,7 +42,7 @@ def test_op_swiglu( import argparse parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--profile", action="store_true") args = parser.parse_args() testShapes = [(2, 3), (512, 4096)] diff --git a/test/test_infer.py b/test/test_infer.py index 59d06b874..1515ae058 100644 --- a/test/test_infer.py +++ b/test/test_infer.py @@ -81,7 +81,7 @@ def llaisys_infer( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) parser.add_argument("--model", default=None, type=str) parser.add_argument("--prompt", default="Who are you?", type=str) parser.add_argument("--max_steps", default=128, type=int) diff --git a/test/test_runtime.py b/test/test_runtime.py index e2ac218a1..1961a1aa7 100644 --- a/test/test_runtime.py +++ b/test/test_runtime.py @@ -55,7 +55,7 @@ def test_memcpy(api, size_bytes: int): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"], type=str) + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia", "metax"], type=str) args = parser.parse_args() test_basic_runtime_api(args.device) diff --git a/test/test_utils.py b/test/test_utils.py index 0f38f0c8e..545907a4e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,7 +1,6 @@ import llaisys import torch - def random_tensor( shape, dtype_name, device_name, device_id=0, scale=None, bias=None ) -> tuple[torch.Tensor, llaisys.Tensor]: @@ -186,7 +185,7 @@ def time_op(func): def torch_device(device_name: str, device_id=0): if device_name == "cpu": return torch.device("cpu") - elif device_name == "nvidia": + elif device_name in ("nvidia", "metax"): return torch.device(f"cuda:{device_id}") else: raise ValueError(f"Unsupported device name: {device_name}") @@ -197,6 +196,8 @@ def llaisys_device(device_name: str): return llaisys.DeviceType.CPU elif device_name == "nvidia": return llaisys.DeviceType.NVIDIA + elif device_name == "metax": + return llaisys.DeviceType.METAX else: raise ValueError(f"Unsupported device name: {device_name}") @@ -206,6 +207,8 @@ def device_name(llaisys_device: llaisys.DeviceType): return "cpu" elif llaisys_device == llaisys.DeviceType.NVIDIA: return "nvidia" + elif llaisys_device == llaisys.DeviceType.METAX: + return "metax" else: raise ValueError(f"Unsupported llaisys device: {llaisys_device}") diff --git a/xmake.lua b/xmake.lua index 1f65f7a95..c6832552a 100644 --- a/xmake.lua +++ b/xmake.lua @@ -3,6 +3,10 @@ set_encodings("utf-8") add_includedirs("include") +local function has_gpu_backend() + return has_config("nv-gpu") or has_config("mx-gpu") +end + -- CPU -- includes("xmake/cpu.lua") @@ -13,9 +17,43 @@ option("nv-gpu") set_description("Whether to compile implementations for Nvidia GPU") option_end() -if has_config("nv-gpu") then +-- MetaX / MACA -- +option("mx-gpu") + set_default(false) + set_showmenu(true) + set_description("Whether to compile implementations for MetaX GPU with MACA") +option_end() + +option("maca-path") + set_default("/opt/maca") + set_showmenu(true) + set_description("MACA toolkit path") +option_end() + +option("mxdriver-path") + set_default("/opt/mxdriver") + set_showmenu(true) + set_description("MetaX driver path") +option_end() + +local use_nv_gpu = has_config("nv-gpu") +local use_mx_gpu = has_config("mx-gpu") + +if use_nv_gpu and use_mx_gpu then + raise("Please enable only one GPU backend at a time: nv-gpu or mx-gpu") +end + +if use_nv_gpu then add_defines("ENABLE_NVIDIA_API") includes("xmake/nvidia.lua") + + if is_plat("linux") then + add_sysincludedirs("/usr/local/cuda/include") + add_linkdirs("/usr/local/cuda/lib64") + end +elseif use_mx_gpu then + add_defines("ENABLE_METAX_API") + includes("xmake/metax.lua") end target("llaisys-utils") @@ -37,6 +75,13 @@ target("llaisys-device") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-device-nvidia") + end + if has_config("mx-gpu") then + add_deps("llaisys-device-metax") + add_links("llaisys-device-metax") + end set_languages("cxx17") set_warnings("all", "error") @@ -53,6 +98,12 @@ target("llaisys-core") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device") + if has_config("nv-gpu") then + add_deps("llaisys-device-nvidia") + end + if has_config("mx-gpu") then + add_deps("llaisys-device-metax") + end set_languages("cxx17") set_warnings("all", "error") @@ -83,18 +134,41 @@ target_end() target("llaisys-ops") set_kind("static") add_deps("llaisys-ops-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-ops-nvidia") + end + if has_config("mx-gpu") then + add_deps("llaisys-ops-metax") + add_links("llaisys-ops-metax") + end set_languages("cxx17") set_warnings("all", "error") if not is_plat("windows") then add_cxflags("-fPIC", "-Wno-unknown-pragmas") end - + add_files("src/ops/*/*.cpp") on_install(function (target) end) target_end() +target("llaisys-models") + set_kind("static") + add_deps("llaisys-tensor") + add_deps("llaisys-ops") + + set_languages("cxx17") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + end + + add_files("src/models/*/*.cpp") + + on_install(function (target) end) +target_end() + target("llaisys") set_kind("shared") add_deps("llaisys-utils") @@ -102,13 +176,45 @@ target("llaisys") add_deps("llaisys-core") add_deps("llaisys-tensor") add_deps("llaisys-ops") + add_deps("llaisys-models") + add_deps("llaisys-device-cpu") + add_deps("llaisys-ops-cpu") set_languages("cxx17") set_warnings("all", "error") + + if has_config("nv-gpu") then + add_deps("llaisys-device-nvidia") + add_deps("llaisys-ops-nvidia") + end + if has_config("mx-gpu") then + add_deps("llaisys-device-metax") + add_deps("llaisys-ops-metax") + add_links("llaisys-device-metax", "llaisys-ops-metax") + end + + if has_config("nv-gpu") then + -- Use nvcc as the shared library linker for proper CUDA device code linking + set_toolset("sh", "nvcc") + + if is_plat("linux") then + add_syslinks("cudart", "cublas") + add_shflags("-Xcompiler", "-fPIC", "-shared", "-rdc=true", {force = true}) + end + end + + if has_config("mx-gpu") and is_plat("linux") then + local maca_path = get_config("maca-path") or "/opt/maca" + local mxdriver_path = get_config("mxdriver-path") or "/opt/mxdriver" + add_linkdirs(path.join(maca_path, "lib")) + add_linkdirs(path.join(mxdriver_path, "lib")) + add_syslinks("mcruntime", "mxc-runtime64", "runtime_cu", "mxsml", "mcblas") + end + add_files("src/llaisys/*.cc") set_installdir(".") - + after_install(function (target) -- copy shared library to python package print("Copying llaisys to python/llaisys/libllaisys/ ..") @@ -118,5 +224,8 @@ target("llaisys") if is_plat("linux") then os.cp("lib/*.so", "python/llaisys/libllaisys/") end + if is_plat("macosx") then + os.cp("lib/*.dylib", "python/llaisys/libllaisys/") + end end) -target_end() \ No newline at end of file +target_end() diff --git a/xmake/metax.lua b/xmake/metax.lua new file mode 100644 index 000000000..2e7527429 --- /dev/null +++ b/xmake/metax.lua @@ -0,0 +1,200 @@ +-- MetaX MACA backend integration. + +local function add_unique(list, value) + if not value or value == "" then + return + end + for _, item in ipairs(list) do + if item == value then + return + end + end + table.insert(list, value) +end + +local function add_if_dir(list, dir) + if os.isdir(dir) then + add_unique(list, dir) + end +end + +local function sorted_files(pattern) + local files = os.files(pattern) + table.sort(files) + return files +end + +local function build_metax_config() + local projectdir = os.projectdir() + local maca_path = get_config("maca-path") or os.getenv("MACA_HOME") or "/opt/maca" + local mxdriver_path = get_config("mxdriver-path") or "/opt/mxdriver" + + local roots = {} + add_unique(roots, maca_path) + add_unique(roots, os.getenv("MACA_HOME")) + add_unique(roots, "/opt/maca") + add_unique(roots, "/usr/local/maca") + add_unique(roots, "/opt/maca-3.3.0") + add_unique(roots, "/opt/maca-3.2.1") + add_unique(roots, "/opt/maca-3.2.0") + add_unique(roots, "/opt/maca-3.1.0") + + local include_dirs = { + path.join(projectdir, "include"), + path.join(projectdir, "src") + } + local link_dirs = {} + for _, root in ipairs(roots) do + add_if_dir(include_dirs, path.join(root, "include")) + add_if_dir(include_dirs, path.join(root, "tools", "cu-bridge", "include")) + add_if_dir(include_dirs, path.join(root, "mxgpu_llvm", "include")) + + add_if_dir(link_dirs, path.join(root, "lib")) + add_if_dir(link_dirs, path.join(root, "lib64")) + add_if_dir(link_dirs, path.join(root, "mxgpu_llvm", "lib")) + add_if_dir(link_dirs, path.join(root, "mxgpu_llvm", "lib64")) + end + add_if_dir(link_dirs, path.join(mxdriver_path, "lib")) + + local mxcc = os.getenv("MXCC") + if not mxcc or mxcc == "" then + local candidate = path.join(maca_path, "mxgpu_llvm", "bin", "mxcc") + mxcc = os.isfile(candidate) and candidate or "mxcc" + end + + return { + projectdir = projectdir, + maca_path = maca_path, + mxdriver_path = mxdriver_path, + mxcc = mxcc, + include_dirs = include_dirs, + link_dirs = link_dirs, + syslinks = {"mcruntime", "mxc-runtime64", "runtime_cu", "mxsml", "mcblas"}, + common_cxflags = is_plat("windows") and {} or {"-fPIC", "-Wno-unknown-pragmas"}, + maca_compile_flags = { + "-std=c++17", + "-D__CUDACC__", + "-x", "maca", + "-offload-arch", "native", + "--maca-path=" .. maca_path, + "-O3", + "-fPIC", + "-Wno-unknown-pragmas", + "-DENABLE_METAX_API" + } + } +end + +local function configure_current_target(cfg) + set_languages("cxx17") + set_warnings("all", "error") + for _, flag in ipairs(cfg.common_cxflags) do + add_cxflags(flag) + end + for _, includedir in ipairs(cfg.include_dirs) do + add_includedirs(includedir, {public = true}) + end + for _, linkdir in ipairs(cfg.link_dirs) do + add_linkdirs(linkdir, {public = true}) + end + add_syslinks(table.unpack(cfg.syslinks), {public = true}) +end + +local function object_path(cfg, group_name, source) + local basename = path.basename(source) .. ".o" + if group_name == "ops" then + local op_name = path.basename(path.directory(path.directory(source))) + return path.join(cfg.projectdir, "build", "_gen", "metax", group_name, op_name, basename) + end + return path.join(cfg.projectdir, "build", "_gen", "metax", group_name, basename) +end + +local function emit_compile_commands(batchcmds, cfg, sources, objects) + for i, source in ipairs(sources) do + local object = objects[i] + batchcmds:mkdir(path.directory(object)) + + local args = {} + for _, flag in ipairs(cfg.maca_compile_flags) do + table.insert(args, flag) + end + for _, includedir in ipairs(cfg.include_dirs) do + table.insert(args, "-I" .. includedir) + end + table.insert(args, "-c") + table.insert(args, source) + table.insert(args, "-o") + table.insert(args, object) + + batchcmds:vrunv(cfg.mxcc, args) + end +end + +local function emit_archive_commands(batchcmds, target, objects) + local targetfile = target:targetfile() + local ar = target:tool("ar") or "ar" + + batchcmds:mkdir(path.directory(targetfile)) + batchcmds:rm(targetfile) + + local args = {"-cr", targetfile} + for _, object in ipairs(objects) do + table.insert(args, object) + end + batchcmds:vrunv(ar, args) +end + +local function set_target_sources(target, cfg, group_name, pattern) + local sources = sorted_files(path.join(cfg.projectdir, pattern)) + local objects = {} + for _, source in ipairs(sources) do + table.insert(objects, object_path(cfg, group_name, source)) + end + target:data_set("metax_sources", sources) + target:data_set("metax_objects", objects) +end + +local function build_target_sources(target, batchcmds, cfg) + local sources = target:data("metax_sources") or {} + local objects = target:data("metax_objects") or {} + emit_compile_commands(batchcmds, cfg, sources, objects) + emit_archive_commands(batchcmds, target, objects) +end + +local function register_metax_archive_target(cfg, name, deps, group_name, pattern) + target(name) + set_kind("static") + for _, dep in ipairs(deps) do + add_deps(dep) + end + configure_current_target(cfg) + + on_load(function (target) + set_target_sources(target, cfg, group_name, pattern) + end) + + on_buildcmd(function (target, batchcmds, opt) + build_target_sources(target, batchcmds, cfg) + end) + + on_install(function (target) end) + target_end() +end + +local metax = build_metax_config() + +register_metax_archive_target( + metax, + "llaisys-device-metax", + {"llaisys-utils"}, + "device", + "src/device/metax/*.maca" +) + +register_metax_archive_target( + metax, + "llaisys-ops-metax", + {"llaisys-tensor"}, + "ops", + "src/ops/*/metax/*.maca" +) diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua new file mode 100644 index 000000000..22fe63db4 --- /dev/null +++ b/xmake/nvidia.lua @@ -0,0 +1,66 @@ +-- NVIDIA CUDA support for LLAISYS +-- Enable with: xmake f --nv-gpu=y +-- +-- Future extensibility notes: +-- - To add AMD GPU support, create xmake/amd.lua with similar structure +-- - To add other vendors, follow the same pattern: xmake/.lua +-- - Each vendor implements the same LlaisysRuntimeAPI interface + +-- Option: CUDA compute capability (can be overridden via xmake f --cuda-arch=sm_90) +option("cuda-arch") + set_default("sm_80") -- Default to A100 (SM80) + set_showmenu(true) + set_description("CUDA compute capability, e.g., sm_80 for A100, sm_90 for H100") +option_end() + +-- Helper: get compute capability from option +local function get_cuda_arch() + local arch = get_config("cuda-arch") + if arch then + -- Remove sm_ prefix if present, we'll add it back + arch = arch:gsub("^sm_", "") + return arch + end + return "80" -- Default A100 +end + +local cuda_arch = get_cuda_arch() + +-- CUDA device runtime +target("llaisys-device-nvidia") + set_kind("static") + set_languages("cxx17") + + add_rules("cuda") + + -- Support multiple architectures for broader compatibility + add_cuflags("-gencode=arch=compute_" .. cuda_arch .. ",code=sm_" .. cuda_arch) + add_cuflags("-O3") + -- Enable relocatable device code for proper linking with shared library + add_cuflags("-rdc=true") + -- Pass -fPIC to the host compiler through nvcc + add_cuflags("-Xcompiler=-fPIC,-Wno-unknown-pragmas") + + add_files("../src/device/nvidia/*.cu") + + on_install(function (target) end) +target_end() + +-- NVIDIA operators +target("llaisys-ops-nvidia") + set_kind("static") + add_deps("llaisys-device-nvidia") + + set_languages("cxx17") + add_rules("cuda") + add_cuflags("-gencode=arch=compute_" .. cuda_arch .. ",code=sm_" .. cuda_arch) + add_cuflags("-O3") + -- Enable relocatable device code for proper linking with shared library + add_cuflags("-rdc=true") + -- Pass -fPIC to the host compiler through nvcc + add_cuflags("-Xcompiler=-fPIC,-Wno-unknown-pragmas") + + add_files("../src/ops/*/nvidia/*.cu") + + on_install(function (target) end) +target_end()