diff --git a/include/llaisys/runtime.h b/include/llaisys/runtime.h index d8e6f66f1..d59f95fc6 100644 --- a/include/llaisys/runtime.h +++ b/include/llaisys/runtime.h @@ -2,6 +2,7 @@ #define LLAISYS_RUNTIME_H #include "../llaisys.h" +#include "tensor.h" __C { // Runtime API Functions @@ -42,6 +43,18 @@ __C { // Llaisys API for switching device context __export void llaisysSetContextRuntime(llaisysDeviceType_t, int); + + // Distributed runtime APIs + __export void llaisysInitDistributed(int rank, int world_size); + __export void llaisysFinalizeDistributed(); + __export uint8_t llaisysDistributedIsInitialized(); + __export int llaisysDistributedRank(); + __export int llaisysDistributedWorldSize(); + + __export void llaisysDistAllReduce(llaisysTensor_t tensor); + __export llaisysTensor_t llaisysDistAllGather(llaisysTensor_t tensor); + __export void llaisysDistBroadcast(llaisysTensor_t tensor, int root); + __export void llaisysDistBarrier(); } #endif // LLAISYS_RUNTIME_H diff --git a/python/llaisys/__init__.py b/python/llaisys/__init__.py index de8d99f48..5347bd096 100644 --- a/python/llaisys/__init__.py +++ b/python/llaisys/__init__.py @@ -1,4 +1,5 @@ from .runtime import RuntimeAPI +from .runtime import DistributedContext from .libllaisys import DeviceType from .libllaisys import DataType from .libllaisys import MemcpyKind @@ -10,6 +11,7 @@ __all__ = [ "RuntimeAPI", + "DistributedContext", "DeviceType", "DataType", "MemcpyKind", diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb527..f8c630b98 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -2,6 +2,7 @@ import sys import ctypes from pathlib import Path +import torch from .runtime import load_runtime from .runtime import LlaisysRuntimeAPI @@ -12,6 +13,8 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +from .models import load_models +from .models import LlaisysQwen2Meta, 
LlaisysQwen2Weights, llaisysQwen2Model_t def load_shared_library(): @@ -38,6 +41,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_models(LIB_LLAISYS) __all__ = [ @@ -52,4 +56,7 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + "LlaisysQwen2Meta", + "LlaisysQwen2Weights", + "llaisysQwen2Model_t", ] diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py new file mode 100644 index 000000000..68ea9190d --- /dev/null +++ b/python/llaisys/libllaisys/models.py @@ -0,0 +1,72 @@ +import ctypes +from ctypes import POINTER, c_void_p, c_size_t, c_int, c_int64, c_float, Structure +from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t +from .tensor import llaisysTensor_t + + +# Model handle type +llaisysQwen2Model_t = c_void_p + + +class LlaisysQwen2Meta(Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +class LlaisysQwen2Weights(Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", POINTER(llaisysTensor_t)), + ("attn_q_w", POINTER(llaisysTensor_t)), + ("attn_q_b", POINTER(llaisysTensor_t)), + ("attn_k_w", POINTER(llaisysTensor_t)), + ("attn_k_b", POINTER(llaisysTensor_t)), + ("attn_v_w", POINTER(llaisysTensor_t)), + ("attn_v_b", POINTER(llaisysTensor_t)), + ("attn_o_w", POINTER(llaisysTensor_t)), + ("mlp_norm_w", POINTER(llaisysTensor_t)), + ("mlp_gate_w", POINTER(llaisysTensor_t)), + ("mlp_up_w", POINTER(llaisysTensor_t)), + ("mlp_down_w", POINTER(llaisysTensor_t)), + ] + + +def load_models(lib): + # llaisysQwen2ModelCreate + lib.llaisysQwen2ModelCreate.argtypes = [ + 
POINTER(LlaisysQwen2Meta), # meta + llaisysDeviceType_t, # device + POINTER(c_int), # device_ids + c_int, # ndevice + ] + lib.llaisysQwen2ModelCreate.restype = llaisysQwen2Model_t + + # llaisysQwen2ModelDestroy + lib.llaisysQwen2ModelDestroy.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelDestroy.restype = None + + # llaisysQwen2ModelWeights + lib.llaisysQwen2ModelWeights.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelWeights.restype = POINTER(LlaisysQwen2Weights) + + # llaisysQwen2ModelInfer + lib.llaisysQwen2ModelInfer.argtypes = [ + llaisysQwen2Model_t, # model + POINTER(c_int64), # token_ids + c_size_t, # ntoken + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 diff --git a/python/llaisys/libllaisys/runtime.py b/python/llaisys/libllaisys/runtime.py index 3e5b8be5b..b0604c61e 100644 --- a/python/llaisys/libllaisys/runtime.py +++ b/python/llaisys/libllaisys/runtime.py @@ -1,6 +1,7 @@ import ctypes from ctypes import c_void_p, c_size_t, c_int, Structure, CFUNCTYPE from .llaisys_types import * +from .tensor import llaisysTensor_t # Define function pointer types get_device_count_api = CFUNCTYPE(c_int) @@ -46,3 +47,30 @@ def load_runtime(lib): lib.llaisysSetContextRuntime.argtypes = [llaisysDeviceType_t, c_int] lib.llaisysSetContextRuntime.restype = None + + lib.llaisysInitDistributed.argtypes = [c_int, c_int] + lib.llaisysInitDistributed.restype = None + + lib.llaisysFinalizeDistributed.argtypes = [] + lib.llaisysFinalizeDistributed.restype = None + + lib.llaisysDistributedIsInitialized.argtypes = [] + lib.llaisysDistributedIsInitialized.restype = ctypes.c_uint8 + + lib.llaisysDistributedRank.argtypes = [] + lib.llaisysDistributedRank.restype = c_int + + lib.llaisysDistributedWorldSize.argtypes = [] + lib.llaisysDistributedWorldSize.restype = c_int + + lib.llaisysDistAllReduce.argtypes = [llaisysTensor_t] + lib.llaisysDistAllReduce.restype = None + + lib.llaisysDistAllGather.argtypes = [llaisysTensor_t] + lib.llaisysDistAllGather.restype = 
llaisysTensor_t + + lib.llaisysDistBroadcast.argtypes = [llaisysTensor_t, c_int] + lib.llaisysDistBroadcast.restype = None + + lib.llaisysDistBarrier.argtypes = [] + lib.llaisysDistBarrier.restype = None diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b21..5c21189e0 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,203 @@ -from typing import Sequence +from typing import Sequence, List from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType +from ..libllaisys import DeviceType, DataType +from ..libllaisys import LlaisysQwen2Meta, LlaisysQwen2Weights from pathlib import Path import safetensors +import json +import ctypes +import re class Qwen2: - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - + def __init__( + self, + model_path, + device: DeviceType = DeviceType.CPU, + device_id: int = 0, + rank: int = 0, + world_size: int = 1, + ): model_path = Path(model_path) - + + # Load config + config_path = model_path / "config.json" + with open(config_path, "r") as f: + config = json.load(f) + + # Extract config values + self.num_layers = config["num_hidden_layers"] # 28 + self.hidden_size = config["hidden_size"] # 1536 + self.num_heads = config["num_attention_heads"] # 12 + self.num_kv_heads = config["num_key_value_heads"] # 2 + self.head_dim = self.hidden_size // self.num_heads # 128 + self.intermediate_size = config["intermediate_size"] # 8960 + self.max_seq_len = config["max_position_embeddings"] # 131072 + self.vocab_size = config["vocab_size"] # 151936 + self.rms_norm_eps = config["rms_norm_eps"] # 1e-6 + self.rope_theta = config["rope_theta"] # 10000 + self.eos_token_id = config["eos_token_id"] # 151643 + + # Map torch dtype to llaisys dtype + torch_dtype = config.get("torch_dtype", "float32") + if torch_dtype == "bfloat16": + self.dtype = DataType.BF16 + elif torch_dtype == "float16": + self.dtype = 
DataType.F16 + else: + self.dtype = DataType.F32 + + self.device = device + self.device_id = device_id + self.rank = rank + self.world_size = world_size + + # Create model meta + meta = LlaisysQwen2Meta() + meta.dtype = self.dtype + meta.nlayer = self.num_layers + meta.hs = self.hidden_size + meta.nh = self.num_heads + meta.nkvh = self.num_kv_heads + meta.dh = self.head_dim + meta.di = self.intermediate_size + meta.maxseq = self.max_seq_len + meta.voc = self.vocab_size + meta.epsilon = self.rms_norm_eps + meta.theta = self.rope_theta + meta.end_token = self.eos_token_id + + # Create model + device_ids = (ctypes.c_int * 1)(device_id) + self._model = LIB_LLAISYS.llaisysQwen2ModelCreate( + ctypes.byref(meta), + device, + device_ids, + 1 + ) + + # Get weights pointer + self._weights = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model).contents + + # Load weights from safetensors + self._load_weights(model_path) + + def __del__(self): + if hasattr(self, "_model") and self._model is not None: + LIB_LLAISYS.llaisysQwen2ModelDestroy(self._model) + self._model = None + + def _load_weights(self, model_path: Path): + """Load weights from safetensors files.""" for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + data = safetensors.safe_open(file, framework="pt", device="cpu") + for name in data.keys(): + tensor = data.get_tensor(name) + self._load_weight(name, tensor) + + def _load_weight(self, name: str, tensor): + """Load a single weight tensor.""" + import torch + + # Convert to contiguous and get raw data pointer + tensor = self._slice_weight(name, tensor).contiguous() + data_ptr = tensor.data_ptr() + + # Map weight name to model weight + if name == "model.embed_tokens.weight": + LIB_LLAISYS.tensorLoad(self._weights.in_embed, ctypes.c_void_p(data_ptr)) + elif name == "lm_head.weight": + 
LIB_LLAISYS.tensorLoad(self._weights.out_embed, ctypes.c_void_p(data_ptr)) + elif name == "model.norm.weight": + LIB_LLAISYS.tensorLoad(self._weights.out_norm_w, ctypes.c_void_p(data_ptr)) + else: + # Parse layer index from name like "model.layers.0.xxx" + match = re.match(r"model\.layers\.(\d+)\.(.*)", name) + if match: + layer_idx = int(match.group(1)) + weight_name = match.group(2) + + if weight_name == "input_layernorm.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_norm_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.q_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_q_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.q_proj.bias": + LIB_LLAISYS.tensorLoad(self._weights.attn_q_b[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.k_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_k_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.k_proj.bias": + LIB_LLAISYS.tensorLoad(self._weights.attn_k_b[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.v_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_v_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.v_proj.bias": + LIB_LLAISYS.tensorLoad(self._weights.attn_v_b[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.o_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_o_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "post_attention_layernorm.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_norm_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "mlp.gate_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_gate_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "mlp.up_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_up_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "mlp.down_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_down_w[layer_idx], 
ctypes.c_void_p(data_ptr)) + + def _slice_weight(self, name: str, tensor): + if self.world_size == 1: + return tensor + + def shard_dim(t, dim): + assert t.shape[dim] % self.world_size == 0 + shard = t.shape[dim] // self.world_size + start = self.rank * shard + end = start + shard + if dim == 0: + return t[start:end] + if dim == 1: + return t[:, start:end] + raise ValueError(f"Unsupported shard dim: {dim}") + + if name in {"model.embed_tokens.weight", "lm_head.weight"}: + return shard_dim(tensor, 0) + if name in {"model.norm.weight"}: + return tensor + + match = re.match(r"model\.layers\.(\d+)\.(.*)", name) + if not match: + return tensor + + weight_name = match.group(2) + if weight_name in { + "input_layernorm.weight", + "post_attention_layernorm.weight", + }: + return tensor + if weight_name in { + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "mlp.gate_proj.weight", + "mlp.up_proj.weight", + }: + return shard_dim(tensor, 0) + if weight_name in { + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", + }: + assert tensor.shape[0] % self.world_size == 0 + shard = tensor.shape[0] // self.world_size + start = self.rank * shard + end = start + shard + return tensor[start:end] + if weight_name in { + "self_attn.o_proj.weight", + "mlp.down_proj.weight", + }: + return shard_dim(tensor, 1) + return tensor def generate( self, @@ -26,8 +206,36 @@ def generate( top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, - ): - - # TODO: Implement generate function - - return [] + ) -> List[int]: + """Generate tokens using argmax sampling (for testing).""" + + if max_new_tokens is None: + max_new_tokens = 128 + + # Convert inputs to list + tokens = list(inputs) + + # First inference with all input tokens + input_array = (ctypes.c_int64 * len(tokens))(*tokens) + next_token = LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, + input_array, + len(tokens) + ) + tokens.append(next_token) + + # Generate 
remaining tokens one by one + for _ in range(max_new_tokens - 1): + if next_token == self.eos_token_id: + break + + # Inference with single token + input_array = (ctypes.c_int64 * 1)(next_token) + next_token = LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, + input_array, + 1 + ) + tokens.append(next_token) + + return tokens diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc8..997a268b8 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -21,7 +21,7 @@ def embedding(out: Tensor, index: Tensor, weight: Tensor): @staticmethod def linear(out: Tensor, inp: Tensor, weight: Tensor, bias: Tensor): LIB_LLAISYS.llaisysLinear( - out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), bias.lib_tensor() + out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), None if bias is None else bias.lib_tensor() ) @staticmethod diff --git a/python/llaisys/runtime.py b/python/llaisys/runtime.py index 15be1aa17..d8f9d5a6f 100644 --- a/python/llaisys/runtime.py +++ b/python/llaisys/runtime.py @@ -1,6 +1,7 @@ from . 
import libllaisys from .libllaisys import LIB_LLAISYS from ctypes import c_void_p +from .tensor import Tensor class RuntimeAPI: @@ -66,3 +67,32 @@ def memcpy_async( self._api.contents.memcpy_async( dst, src, size, libllaisys.llaisysMemcpyKind_t(kind), stream ) + + +class DistributedContext: + def init(self, rank: int, world_size: int) -> None: + LIB_LLAISYS.llaisysInitDistributed(rank, world_size) + + def finalize(self) -> None: + LIB_LLAISYS.llaisysFinalizeDistributed() + + def is_initialized(self) -> bool: + return bool(LIB_LLAISYS.llaisysDistributedIsInitialized()) + + def rank(self) -> int: + return int(LIB_LLAISYS.llaisysDistributedRank()) + + def world_size(self) -> int: + return int(LIB_LLAISYS.llaisysDistributedWorldSize()) + + def all_reduce(self, tensor: Tensor) -> None: + LIB_LLAISYS.llaisysDistAllReduce(tensor.lib_tensor()) + + def all_gather(self, tensor: Tensor) -> Tensor: + return Tensor(tensor=LIB_LLAISYS.llaisysDistAllGather(tensor.lib_tensor())) + + def broadcast(self, tensor: Tensor, root: int = 0) -> None: + LIB_LLAISYS.llaisysDistBroadcast(tensor.lib_tensor(), root) + + def barrier(self) -> None: + LIB_LLAISYS.llaisysDistBarrier() diff --git a/src/core/context/context.cpp b/src/core/context/context.cpp index 44894b9e7..418a9edef 100644 --- a/src/core/context/context.cpp +++ b/src/core/context/context.cpp @@ -1,5 +1,9 @@ #include "context.hpp" +#include "../../tensor/tensor.hpp" #include "../../utils.hpp" +#ifdef ENABLE_NVIDIA_API +#include "../../device/nvidia/nccl_context.hpp" +#endif #include namespace llaisys::core { @@ -32,6 +36,8 @@ Context::Context() { } Context::~Context() { + finalizeDistributed(); + // Destroy current runtime first. delete _current_runtime; @@ -52,7 +58,7 @@ Context::~Context() { void Context::setDevice(llaisysDeviceType_t device_type, int device_id) { // If doest not match the current runtime. 
if (_current_runtime == nullptr || _current_runtime->deviceType() != device_type || _current_runtime->deviceId() != device_id) { - auto runtimes = _runtime_map[device_type]; + auto &runtimes = _runtime_map[device_type]; CHECK_ARGUMENT((size_t)device_id < runtimes.size() && device_id >= 0, "invalid device id"); if (_current_runtime != nullptr) { _current_runtime->_deactivate(); @@ -70,6 +76,115 @@ Runtime &Context::runtime() { return *_current_runtime; } +void Context::initDistributed(int rank, int world_size) { +#ifdef ENABLE_NVIDIA_API + setDevice(LLAISYS_DEVICE_NVIDIA, rank); + _dist_context = std::make_unique(rank, world_size, runtime().deviceId()); + return; +#else + (void)rank; + (void)world_size; + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::finalizeDistributed() { +#ifdef ENABLE_NVIDIA_API + _dist_context.reset(); +#endif +} + +bool Context::distributedInitialized() const { +#ifdef ENABLE_NVIDIA_API + return _dist_context != nullptr; +#else + return false; +#endif +} + +int Context::distributedRank() const { +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + return _dist_context->rank(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +int Context::distributedWorldSize() const { +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + return _dist_context->worldSize(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::allReduce(const tensor_t &tensor) { + ASSERT(tensor != nullptr, "allReduce tensor must not be null."); + ASSERT(tensor->isContiguous(), "allReduce requires a contiguous tensor."); + ASSERT(tensor->deviceType() == LLAISYS_DEVICE_NVIDIA, "Distributed collectives currently support NVIDIA tensors only."); +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(tensor->deviceType(), tensor->deviceId()); + ASSERT(runtime().deviceId() == 
_dist_context->deviceId(), "Tensor device does not match distributed communicator device."); + _dist_context->allReduce(tensor->data(), tensor->numel(), tensor->dtype(), runtime().stream()); + runtime().synchronize(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +tensor_t Context::allGather(const tensor_t &tensor) { + ASSERT(tensor != nullptr, "allGather tensor must not be null."); + ASSERT(tensor->isContiguous(), "allGather requires a contiguous tensor."); + ASSERT(tensor->deviceType() == LLAISYS_DEVICE_NVIDIA, "Distributed collectives currently support NVIDIA tensors only."); +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(tensor->deviceType(), tensor->deviceId()); + ASSERT(runtime().deviceId() == _dist_context->deviceId(), "Tensor device does not match distributed communicator device."); + + std::vector gathered_shape; + gathered_shape.reserve(tensor->ndim() + 1); + gathered_shape.push_back(static_cast(_dist_context->worldSize())); + gathered_shape.insert(gathered_shape.end(), tensor->shape().begin(), tensor->shape().end()); + + auto gathered = llaisys::Tensor::create(gathered_shape, tensor->dtype(), tensor->deviceType(), tensor->deviceId()); + _dist_context->allGather(tensor->data(), gathered->data(), tensor->numel(), tensor->dtype(), runtime().stream()); + runtime().synchronize(); + return gathered; +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::broadcast(const tensor_t &tensor, int root) { + ASSERT(tensor != nullptr, "broadcast tensor must not be null."); + ASSERT(tensor->isContiguous(), "broadcast requires a contiguous tensor."); + ASSERT(tensor->deviceType() == LLAISYS_DEVICE_NVIDIA, "Distributed collectives currently support NVIDIA tensors only."); +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(tensor->deviceType(), tensor->deviceId()); + ASSERT(runtime().deviceId() == 
_dist_context->deviceId(), "Tensor device does not match distributed communicator device."); + _dist_context->broadcast(tensor->data(), tensor->numel(), tensor->dtype(), root, runtime().stream()); + runtime().synchronize(); +#else + (void)root; + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::barrier() { +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(LLAISYS_DEVICE_NVIDIA, _dist_context->deviceId()); + _dist_context->barrier(runtime().stream()); + runtime().synchronize(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + // Global API to get thread-local context. Context &context() { thread_local Context thread_context; diff --git a/src/core/context/context.hpp b/src/core/context/context.hpp index a3ebcdecf..7804d2b4c 100644 --- a/src/core/context/context.hpp +++ b/src/core/context/context.hpp @@ -6,14 +6,29 @@ #include "../runtime/runtime.hpp" +#include #include #include +namespace llaisys { +class Tensor; +using tensor_t = std::shared_ptr; +} + +#ifdef ENABLE_NVIDIA_API +namespace llaisys::device::nvidia { +class NcclContext; +} +#endif + namespace llaisys::core { class Context { private: std::unordered_map> _runtime_map; - Runtime *_current_runtime; + Runtime *_current_runtime = nullptr; +#ifdef ENABLE_NVIDIA_API + std::unique_ptr _dist_context; +#endif Context(); public: @@ -30,6 +45,17 @@ class Context { void setDevice(llaisysDeviceType_t device_type, int device_id); Runtime &runtime(); + void initDistributed(int rank, int world_size); + void finalizeDistributed(); + bool distributedInitialized() const; + int distributedRank() const; + int distributedWorldSize() const; + + void allReduce(const tensor_t &tensor); + tensor_t allGather(const tensor_t &tensor); + void broadcast(const tensor_t &tensor, int root); + void barrier(); + friend Context &context(); }; } // namespace llaisys::core diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 
7f03a8622..f9b28248b 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -7,6 +7,7 @@ namespace llaisys::core { Runtime::Runtime(llaisysDeviceType_t device_type, int device_id) : _device_type(device_type), _device_id(device_id), _is_active(false) { _api = llaisys::device::getRuntimeAPI(_device_type); + _api->set_device(_device_id); _stream = _api->create_stream(); _allocator = new allocators::NaiveAllocator(_api); } diff --git a/src/device/nvidia/cuda_utils.cuh b/src/device/nvidia/cuda_utils.cuh new file mode 100644 index 000000000..e9fcf762b --- /dev/null +++ b/src/device/nvidia/cuda_utils.cuh @@ -0,0 +1,109 @@ +#pragma once + +#include "../../utils.hpp" + +#include +#include +#include +#include + +#include +#include + +namespace llaisys::device::nvidia { + +#define CUDA_CHECK(EXPR__) \ + do { \ + cudaError_t err__ = (EXPR__); \ + if (err__ != cudaSuccess) { \ + std::cerr << "[ERROR] CUDA call failed: " << cudaGetErrorString(err__) << " (" \ + << static_cast(err__) << ")" << EXCEPTION_LOCATION_MSG << std::endl; \ + throw std::runtime_error(cudaGetErrorString(err__)); \ + } \ + } while (0) + +#define NCCL_CHECK(EXPR__) \ + do { \ + ncclResult_t err__ = (EXPR__); \ + if (err__ != ncclSuccess) { \ + std::cerr << "[ERROR] NCCL call failed: " << ncclGetErrorString(err__) << " (" \ + << static_cast(err__) << ")" << EXCEPTION_LOCATION_MSG << std::endl; \ + throw std::runtime_error(ncclGetErrorString(err__)); \ + } \ + } while (0) + +inline cudaMemcpyKind toCudaMemcpyKind(llaisysMemcpyKind_t kind) { + switch (kind) { + case LLAISYS_MEMCPY_H2H: + return cudaMemcpyHostToHost; + case LLAISYS_MEMCPY_H2D: + return cudaMemcpyHostToDevice; + case LLAISYS_MEMCPY_D2H: + return cudaMemcpyDeviceToHost; + case LLAISYS_MEMCPY_D2D: + return cudaMemcpyDeviceToDevice; + default: + CHECK_ARGUMENT(false, "invalid memcpy kind"); + return cudaMemcpyDefault; + } +} + +inline cudaStream_t toCudaStream(llaisysStream_t stream) { + return reinterpret_cast(stream); +} + 
+template +__device__ inline float scalarToFloat(T value); + +template <> +__device__ inline float scalarToFloat(float value) { + return value; +} + +template <> +__device__ inline float scalarToFloat(llaisys::fp16_t value) { + union { + uint16_t u; + __half h; + } caster{value._v}; + return __half2float(caster.h); +} + +template <> +__device__ inline float scalarToFloat(llaisys::bf16_t value) { + union { + uint16_t u; + __nv_bfloat16 b; + } caster{value._v}; + return __bfloat162float(caster.b); +} + +template +__device__ inline T floatToScalar(float value); + +template <> +__device__ inline float floatToScalar(float value) { + return value; +} + +template <> +__device__ inline llaisys::fp16_t floatToScalar(float value) { + union { + uint16_t u; + __half h; + } caster; + caster.h = __float2half_rn(value); + return llaisys::fp16_t{caster.u}; +} + +template <> +__device__ inline llaisys::bf16_t floatToScalar(float value) { + union { + uint16_t u; + __nv_bfloat16 b; + } caster; + caster.b = __float2bfloat16(value); + return llaisys::bf16_t{caster.u}; +} + +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nccl_context.cu b/src/device/nvidia/nccl_context.cu new file mode 100644 index 000000000..136545cec --- /dev/null +++ b/src/device/nvidia/nccl_context.cu @@ -0,0 +1,221 @@ +#include "nccl_context.hpp" +#include "cuda_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace llaisys::device::nvidia { +namespace { + +std::string sanitizeForPath(const std::string &value) { + std::string sanitized = value; + for (char &ch : sanitized) { + const bool keep = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '-' || ch == '_'; + if (!keep) { + ch = '_'; + } + } + return sanitized; +} + +std::string envOrEmpty(const char *name) { + const char *value = std::getenv(name); + if (value == nullptr) { + return {}; + } + return value; +} + +} // namespace + +ncclDataType_t 
toNcclDataType(llaisysDataType_t dtype) { + switch (dtype) { + case LLAISYS_DTYPE_BOOL: + return ncclUint8; + case LLAISYS_DTYPE_I8: + return ncclInt8; + case LLAISYS_DTYPE_I32: + return ncclInt32; + case LLAISYS_DTYPE_I64: + return ncclInt64; + case LLAISYS_DTYPE_U8: + case LLAISYS_DTYPE_BYTE: + return ncclUint8; + case LLAISYS_DTYPE_U32: + return ncclUint32; + case LLAISYS_DTYPE_U64: + return ncclUint64; + case LLAISYS_DTYPE_F16: + return ncclFloat16; + case LLAISYS_DTYPE_F32: + return ncclFloat32; + case LLAISYS_DTYPE_F64: + return ncclFloat64; + case LLAISYS_DTYPE_BF16: + return ncclBfloat16; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} + +ncclRedOp_t toNcclReduceOp(NcclReduceOp op) { + switch (op) { + case NcclReduceOp::Sum: + return ncclSum; + default: + throw std::invalid_argument("Unsupported NCCL reduce op"); + } +} + +std::string resolveBootstrapPath() { + const std::string explicit_path = envOrEmpty("LLAISYS_DIST_BOOTSTRAP_PATH"); + if (!explicit_path.empty()) { + return explicit_path; + } + + const std::string dist_id = envOrEmpty("LLAISYS_DIST_ID"); + if (!dist_id.empty()) { + return "/tmp/llaisys_nccl_" + sanitizeForPath(dist_id) + ".bin"; + } + + const std::string master_addr = envOrEmpty("MASTER_ADDR"); + const std::string master_port = envOrEmpty("MASTER_PORT"); + if (!master_addr.empty() && !master_port.empty()) { + return "/tmp/llaisys_nccl_" + sanitizeForPath(master_addr) + "_" + sanitizeForPath(master_port) + ".bin"; + } + + throw std::invalid_argument( + "Distributed NCCL init requires LLAISYS_DIST_BOOTSTRAP_PATH, LLAISYS_DIST_ID, or MASTER_ADDR/MASTER_PORT."); +} + +void storeUniqueId(const std::string &path, const ncclUniqueId &unique_id) { + const std::filesystem::path file_path(path); + const std::filesystem::path dir_path = file_path.parent_path(); + if (!dir_path.empty()) { + std::filesystem::create_directories(dir_path); + } + + std::error_code remove_ec; + std::filesystem::remove(file_path, remove_ec); + + 
std::ostringstream temp_path_builder; + temp_path_builder << path << ".tmp." << ::getpid(); + const std::string temp_path = temp_path_builder.str(); + + { + std::ofstream output(temp_path, std::ios::binary | std::ios::trunc); + CHECK_ARGUMENT(output.good(), "failed to open NCCL bootstrap temp file"); + output.write(reinterpret_cast(&unique_id), sizeof(unique_id)); + CHECK_ARGUMENT(output.good(), "failed to write NCCL bootstrap data"); + output.flush(); + CHECK_ARGUMENT(output.good(), "failed to flush NCCL bootstrap data"); + } + + std::filesystem::rename(temp_path, path); +} + +ncclUniqueId loadUniqueId(int rank) { + const std::string path = resolveBootstrapPath(); + ncclUniqueId unique_id{}; + if (rank == 0) { + NCCL_CHECK(ncclGetUniqueId(&unique_id)); + storeUniqueId(path, unique_id); + return unique_id; + } + + constexpr int kMaxAttempts = 600; + constexpr auto kPollInterval = std::chrono::milliseconds(100); + constexpr auto kFreshWindow = std::chrono::seconds(5); + for (int attempt = 0; attempt < kMaxAttempts; ++attempt) { + std::error_code ec; + const bool exists = std::filesystem::exists(path, ec); + const bool size_ok = exists && !ec && std::filesystem::file_size(path, ec) == sizeof(unique_id); + const auto min_write_time = std::filesystem::file_time_type::clock::now() - kFreshWindow; + const bool fresh_enough = size_ok && !ec && std::filesystem::last_write_time(path, ec) >= min_write_time; + if (fresh_enough && !ec) { + std::ifstream input(path, std::ios::binary); + if (input.good()) { + input.read(reinterpret_cast(&unique_id), sizeof(unique_id)); + if (input.good()) { + return unique_id; + } + } + } + std::this_thread::sleep_for(kPollInterval); + } + + throw std::runtime_error("Timed out waiting for NCCL bootstrap file"); +} + +NcclContext::NcclContext(int rank, int world_size, int device_id) + : _rank(rank), + _world_size(world_size), + _device_id(device_id), + _comm(nullptr), + _barrier_buffer(nullptr) { + CHECK_ARGUMENT(world_size > 0, "world_size must 
be positive"); + CHECK_ARGUMENT(rank >= 0 && rank < world_size, "rank must be in [0, world_size)"); + CHECK_ARGUMENT(device_id >= 0, "device_id must be non-negative"); + + CUDA_CHECK(cudaSetDevice(device_id)); + const ncclUniqueId unique_id = loadUniqueId(rank); + NCCL_CHECK(ncclCommInitRank(&_comm, world_size, unique_id, rank)); + CUDA_CHECK(cudaMalloc(&_barrier_buffer, sizeof(int32_t))); +} + +NcclContext::~NcclContext() { + if (_barrier_buffer != nullptr) { + cudaFree(_barrier_buffer); + _barrier_buffer = nullptr; + } + if (_comm != nullptr) { + ncclCommDestroy(_comm); + _comm = nullptr; + } +} + +int NcclContext::rank() const { + return _rank; +} + +int NcclContext::worldSize() const { + return _world_size; +} + +int NcclContext::deviceId() const { + return _device_id; +} + +void NcclContext::allReduce(void *data, size_t count, llaisysDataType_t dtype, llaisysStream_t stream, NcclReduceOp op) const { + CHECK_ARGUMENT(data != nullptr || count == 0, "allReduce data must not be null"); + std::lock_guard lock(_mutex); + NCCL_CHECK(ncclAllReduce(data, data, count, toNcclDataType(dtype), toNcclReduceOp(op), _comm, toCudaStream(stream))); +} + +void NcclContext::allGather(const void *send_buffer, void *recv_buffer, size_t count, llaisysDataType_t dtype, llaisysStream_t stream) const { + CHECK_ARGUMENT(send_buffer != nullptr || count == 0, "allGather send buffer must not be null"); + CHECK_ARGUMENT(recv_buffer != nullptr || count == 0, "allGather recv buffer must not be null"); + std::lock_guard lock(_mutex); + NCCL_CHECK(ncclAllGather(send_buffer, recv_buffer, count, toNcclDataType(dtype), _comm, toCudaStream(stream))); +} + +void NcclContext::broadcast(void *data, size_t count, llaisysDataType_t dtype, int root, llaisysStream_t stream) const { + CHECK_ARGUMENT(data != nullptr || count == 0, "broadcast data must not be null"); + CHECK_ARGUMENT(root >= 0 && root < _world_size, "broadcast root must be in [0, world_size)"); + std::lock_guard lock(_mutex); + 
NCCL_CHECK(ncclBroadcast(data, data, count, toNcclDataType(dtype), root, _comm, toCudaStream(stream))); +} + +void NcclContext::barrier(llaisysStream_t stream) const { + std::lock_guard lock(_mutex); + CUDA_CHECK(cudaMemsetAsync(_barrier_buffer, 0, sizeof(int32_t), toCudaStream(stream))); + NCCL_CHECK(ncclAllReduce(_barrier_buffer, _barrier_buffer, 1, ncclInt32, ncclSum, _comm, toCudaStream(stream))); +} + +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nccl_context.hpp b/src/device/nvidia/nccl_context.hpp new file mode 100644 index 000000000..f46828b13 --- /dev/null +++ b/src/device/nvidia/nccl_context.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +struct ncclComm; +typedef ncclComm *ncclComm_t; + +namespace llaisys::device::nvidia { + +enum class NcclReduceOp { + Sum = 0, +}; + +class NcclContext { +private: + int _rank; + int _world_size; + int _device_id; + ncclComm_t _comm; + void *_barrier_buffer; + mutable std::mutex _mutex; + +public: + NcclContext(int rank, int world_size, int device_id); + ~NcclContext(); + + NcclContext(const NcclContext &) = delete; + NcclContext &operator=(const NcclContext &) = delete; + NcclContext(NcclContext &&) = delete; + NcclContext &operator=(NcclContext &&) = delete; + + int rank() const; + int worldSize() const; + int deviceId() const; + + void allReduce(void *data, size_t count, llaisysDataType_t dtype, llaisysStream_t stream, NcclReduceOp op = NcclReduceOp::Sum) const; + void allGather(const void *send_buffer, void *recv_buffer, size_t count, llaisysDataType_t dtype, llaisysStream_t stream) const; + void broadcast(void *data, size_t count, llaisysDataType_t dtype, int root, llaisysStream_t stream) const; + void barrier(llaisysStream_t stream) const; +}; + +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_resource.cu b/src/device/nvidia/nvidia_resource.cu index 2e63647e5..8144b5922 100644 --- a/src/device/nvidia/nvidia_resource.cu +++ 
b/src/device/nvidia/nvidia_resource.cu @@ -1,7 +1,40 @@ #include "nvidia_resource.cuh" +#include "cuda_utils.cuh" + +#include namespace llaisys::device::nvidia { -Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {} +Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id), _cublas_handle(nullptr) { + CUDA_CHECK(cudaSetDevice(device_id)); + cublasStatus_t status = cublasCreate(&_cublas_handle); + if (status != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create cuBLAS handle"); + } +} + +Resource::~Resource() { + if (_cublas_handle != nullptr) { + cublasDestroy(_cublas_handle); + _cublas_handle = nullptr; + } +} + +cublasHandle_t Resource::cublasHandle() const { + return _cublas_handle; +} + +Resource &resource(int device_id) { + static std::mutex resource_mutex; + static std::unordered_map> resources; + + std::lock_guard lock(resource_mutex); + auto it = resources.find(device_id); + if (it == resources.end()) { + auto inserted = resources.emplace(device_id, std::make_unique(device_id)); + it = inserted.first; + } + return *it->second; +} } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_resource.cuh b/src/device/nvidia/nvidia_resource.cuh index a3002170b..085320c03 100644 --- a/src/device/nvidia/nvidia_resource.cuh +++ b/src/device/nvidia/nvidia_resource.cuh @@ -1,11 +1,23 @@ #pragma once #include "../device_resource.hpp" +#include + +#include +#include +#include namespace llaisys::device::nvidia { class Resource : public llaisys::device::DeviceResource { +private: + cublasHandle_t _cublas_handle; + public: Resource(int device_id); ~Resource(); + + cublasHandle_t cublasHandle() const; }; + +Resource &resource(int device_id); } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu index cab928261..2cdd7ae46 100644 --- 
a/src/device/nvidia/nvidia_runtime_api.cu +++ b/src/device/nvidia/nvidia_runtime_api.cu @@ -1,56 +1,98 @@ #include "../runtime_api.hpp" -#include #include +#include "cuda_utils.cuh" namespace llaisys::device::nvidia { namespace runtime_api { int getDeviceCount() { - TO_BE_IMPLEMENTED(); + int count = 0; + CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; } -void setDevice(int) { - TO_BE_IMPLEMENTED(); +void setDevice(int device_id) { + CUDA_CHECK(cudaSetDevice(device_id)); } void deviceSynchronize() { - TO_BE_IMPLEMENTED(); + CUDA_CHECK(cudaDeviceSynchronize()); } llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); + cudaStream_t stream = nullptr; + CUDA_CHECK(cudaStreamCreate(&stream)); + return reinterpret_cast(stream); } void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (stream == nullptr) { + return; + } + CUDA_CHECK(cudaStreamDestroy(toCudaStream(stream))); } + void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (stream == nullptr) { + CUDA_CHECK(cudaDeviceSynchronize()); + return; + } + CUDA_CHECK(cudaStreamSynchronize(toCudaStream(stream))); } void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); + if (size == 0) { + return nullptr; + } + void *ptr = nullptr; + CUDA_CHECK(cudaMalloc(&ptr, size)); + return ptr; } void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + CUDA_CHECK(cudaFree(ptr)); } void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); + if (size == 0) { + return nullptr; + } + void *ptr = nullptr; + CUDA_CHECK(cudaMallocHost(&ptr, size)); + return ptr; } void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + CUDA_CHECK(cudaFreeHost(ptr)); } void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); + if (size == 0) { + return; + } + if (kind == LLAISYS_MEMCPY_H2H) { + std::memcpy(dst, src, size); + return; + } + CUDA_CHECK(cudaMemcpy(dst, src, size, 
toCudaMemcpyKind(kind))); } -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + if (size == 0) { + return; + } + if (kind == LLAISYS_MEMCPY_H2H) { + std::memcpy(dst, src, size); + return; + } + CUDA_CHECK(cudaMemcpyAsync(dst, src, size, toCudaMemcpyKind(kind), toCudaStream(stream))); } static const LlaisysRuntimeAPI RUNTIME_API = { diff --git a/src/llaisys/models.cc b/src/llaisys/models.cc new file mode 100644 index 000000000..b19ca1723 --- /dev/null +++ b/src/llaisys/models.cc @@ -0,0 +1,128 @@ +#include "llaisys/models/qwen2.h" + +#include "llaisys_tensor.hpp" +#include "../models/qwen2/qwen2.hpp" + +__C { + +struct LlaisysQwen2Model { + llaisys::models::Qwen2Model* model; + LlaisysQwen2Weights weights; +}; + +__export struct LlaisysQwen2Model* llaisysQwen2ModelCreate( + const LlaisysQwen2Meta* meta, + llaisysDeviceType_t device, + int* device_ids, + int ndevice) { + + // Create config from meta + llaisys::models::Qwen2Config config; + config.dtype = meta->dtype; + config.num_layers = meta->nlayer; + config.hidden_size = meta->hs; + config.num_heads = meta->nh; + config.num_kv_heads = meta->nkvh; + config.head_dim = meta->dh; + config.intermediate_size = meta->di; + config.max_seq_len = meta->maxseq; + config.vocab_size = meta->voc; + config.rms_norm_eps = meta->epsilon; + config.rope_theta = meta->theta; + config.eos_token_id = meta->end_token; + + // For now, use first device + int device_id = (ndevice > 0 && device_ids != nullptr) ? 
device_ids[0] : 0; + + auto* wrapper = new LlaisysQwen2Model(); + wrapper->model = new llaisys::models::Qwen2Model(config, device, device_id); + + // Set up weight pointers + auto& model = *wrapper->model; + wrapper->weights.in_embed = new LlaisysTensor{model.embedTokens()}; + wrapper->weights.out_embed = new LlaisysTensor{model.lmHead()}; + wrapper->weights.out_norm_w = new LlaisysTensor{model.normWeight()}; + + size_t nlayer = config.num_layers; + wrapper->weights.attn_norm_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_q_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_q_b = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_k_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_k_b = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_v_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_v_b = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_o_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_norm_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_gate_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_up_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_down_w = new llaisysTensor_t[nlayer]; + + for (size_t i = 0; i < nlayer; ++i) { + wrapper->weights.attn_norm_w[i] = new LlaisysTensor{model.inputLayernormWeight(i)}; + wrapper->weights.attn_q_w[i] = new LlaisysTensor{model.qProjWeight(i)}; + wrapper->weights.attn_q_b[i] = new LlaisysTensor{model.qProjBias(i)}; + wrapper->weights.attn_k_w[i] = new LlaisysTensor{model.kProjWeight(i)}; + wrapper->weights.attn_k_b[i] = new LlaisysTensor{model.kProjBias(i)}; + wrapper->weights.attn_v_w[i] = new LlaisysTensor{model.vProjWeight(i)}; + wrapper->weights.attn_v_b[i] = new LlaisysTensor{model.vProjBias(i)}; + wrapper->weights.attn_o_w[i] = new LlaisysTensor{model.oProjWeight(i)}; + wrapper->weights.mlp_norm_w[i] = new LlaisysTensor{model.postAttnLayernormWeight(i)}; + wrapper->weights.mlp_gate_w[i] = new LlaisysTensor{model.gateProjWeight(i)}; + 
wrapper->weights.mlp_up_w[i] = new LlaisysTensor{model.upProjWeight(i)}; + wrapper->weights.mlp_down_w[i] = new LlaisysTensor{model.downProjWeight(i)}; + } + + return wrapper; +} + +__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model* model) { + if (model == nullptr) return; + + size_t nlayer = model->model->config().num_layers; + + // Delete weight wrappers + delete model->weights.in_embed; + delete model->weights.out_embed; + delete model->weights.out_norm_w; + + for (size_t i = 0; i < nlayer; ++i) { + delete model->weights.attn_norm_w[i]; + delete model->weights.attn_q_w[i]; + delete model->weights.attn_q_b[i]; + delete model->weights.attn_k_w[i]; + delete model->weights.attn_k_b[i]; + delete model->weights.attn_v_w[i]; + delete model->weights.attn_v_b[i]; + delete model->weights.attn_o_w[i]; + delete model->weights.mlp_norm_w[i]; + delete model->weights.mlp_gate_w[i]; + delete model->weights.mlp_up_w[i]; + delete model->weights.mlp_down_w[i]; + } + + delete[] model->weights.attn_norm_w; + delete[] model->weights.attn_q_w; + delete[] model->weights.attn_q_b; + delete[] model->weights.attn_k_w; + delete[] model->weights.attn_k_b; + delete[] model->weights.attn_v_w; + delete[] model->weights.attn_v_b; + delete[] model->weights.attn_o_w; + delete[] model->weights.mlp_norm_w; + delete[] model->weights.mlp_gate_w; + delete[] model->weights.mlp_up_w; + delete[] model->weights.mlp_down_w; + + delete model->model; + delete model; +} + +__export struct LlaisysQwen2Weights* llaisysQwen2ModelWeights(struct LlaisysQwen2Model* model) { + return &model->weights; +} + +__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model* model, int64_t* token_ids, size_t ntoken) { + return model->model->infer(token_ids, ntoken); +} + +} diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32f..3a1a8a7c4 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -23,7 +23,7 @@ __C { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } void 
llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias) { - llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias == nullptr ? nullptr : bias->tensor); } void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in) { llaisys::ops::rearrange(out->tensor, in->tensor); diff --git a/src/llaisys/runtime.cc b/src/llaisys/runtime.cc index 7b00ff1bb..7ef917c0d 100644 --- a/src/llaisys/runtime.cc +++ b/src/llaisys/runtime.cc @@ -1,4 +1,5 @@ #include "llaisys/runtime.h" +#include "llaisys_tensor.hpp" #include "../core/context/context.hpp" #include "../device/runtime_api.hpp" @@ -10,4 +11,40 @@ __C void llaisysSetContextRuntime(llaisysDeviceType_t device_type, int device_id // Llaisys API for getting the runtime APIs __C const LlaisysRuntimeAPI *llaisysGetRuntimeAPI(llaisysDeviceType_t device_type) { return llaisys::device::getRuntimeAPI(device_type); +} + +__C void llaisysInitDistributed(int rank, int world_size) { + llaisys::core::context().initDistributed(rank, world_size); +} + +__C void llaisysFinalizeDistributed() { + llaisys::core::context().finalizeDistributed(); +} + +__C uint8_t llaisysDistributedIsInitialized() { + return static_cast(llaisys::core::context().distributedInitialized()); +} + +__C int llaisysDistributedRank() { + return llaisys::core::context().distributedRank(); +} + +__C int llaisysDistributedWorldSize() { + return llaisys::core::context().distributedWorldSize(); +} + +__C void llaisysDistAllReduce(llaisysTensor_t tensor) { + llaisys::core::context().allReduce(tensor->tensor); +} + +__C llaisysTensor_t llaisysDistAllGather(llaisysTensor_t tensor) { + return new LlaisysTensor{llaisys::core::context().allGather(tensor->tensor)}; +} + +__C void llaisysDistBroadcast(llaisysTensor_t tensor, int root) { + llaisys::core::context().broadcast(tensor->tensor, root); +} + +__C void llaisysDistBarrier() { + 
llaisys::core::context().barrier(); } \ No newline at end of file diff --git a/src/models/qwen2/qwen2.cpp b/src/models/qwen2/qwen2.cpp new file mode 100644 index 000000000..4cacea842 --- /dev/null +++ b/src/models/qwen2/qwen2.cpp @@ -0,0 +1,288 @@ +#include "qwen2.hpp" +#include "../../utils.hpp" + +namespace llaisys::models { +namespace { + +std::pair evenShardRange(size_t total, size_t rank, size_t world_size) { + ASSERT(world_size > 0, "Tensor parallel world size must be positive."); + ASSERT(total % world_size == 0, "Tensor parallel shard requires evenly divisible dimension."); + const size_t shard = total / world_size; + return {rank * shard, (rank + 1) * shard}; +} + +tensor_t sliceColumns(const tensor_t &tensor, size_t start, size_t end) { + ASSERT(tensor->ndim() == 2, "sliceColumns expects a 2D tensor."); + return tensor->slice(1, start, end); +} + +tensor_t makeContiguous(const tensor_t &tensor) { + if (tensor->isContiguous()) { + return tensor; + } + auto out = Tensor::create(tensor->shape(), tensor->dtype(), tensor->deviceType(), tensor->deviceId()); + ops::rearrange(out, tensor); + return out; +} + +float readScalarAsFloat(const std::byte *data, llaisysDataType_t dtype, size_t index) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return reinterpret_cast(data)[index]; + case LLAISYS_DTYPE_F16: + return llaisys::utils::cast(reinterpret_cast(data)[index]); + case LLAISYS_DTYPE_BF16: + return llaisys::utils::cast(reinterpret_cast(data)[index]); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} + +} // namespace + +Qwen2Model::Qwen2Model(const Qwen2Config& config, llaisysDeviceType_t device_type, int device_id) + : config_(config), + device_type_(device_type), + device_id_(device_id), + tp_rank_(0), + tp_world_size_(1), + hidden_shard_size_(0), + kv_hidden_size_(0), + kv_shard_size_(0), + intermediate_shard_size_(0), + vocab_shard_size_(0), + vocab_start_(0), + vocab_end_(0), + cache_len_(0), + buffer_seq_len_(0) { + if (device_type_ == 
LLAISYS_DEVICE_NVIDIA && core::context().distributedInitialized()) { + tp_rank_ = static_cast(core::context().distributedRank()); + tp_world_size_ = static_cast(core::context().distributedWorldSize()); + } + + const size_t nl = config_.num_layers; + const size_t hs = config_.hidden_size; + kv_hidden_size_ = config_.num_kv_heads * config_.head_dim; + ASSERT(hs % tp_world_size_ == 0, "Tensor parallel hidden size must be divisible by world size."); + ASSERT(kv_hidden_size_ % tp_world_size_ == 0, "Tensor parallel KV size must be divisible by world size."); + ASSERT(config_.intermediate_size % tp_world_size_ == 0, "Tensor parallel intermediate size must be divisible by world size."); + ASSERT(config_.vocab_size % tp_world_size_ == 0, "Tensor parallel vocab size must be divisible by world size."); + hidden_shard_size_ = hs / tp_world_size_; + kv_shard_size_ = kv_hidden_size_ / tp_world_size_; + intermediate_shard_size_ = config_.intermediate_size / tp_world_size_; + vocab_shard_size_ = config_.vocab_size / tp_world_size_; + auto vocab_range = evenShardRange(config_.vocab_size, tp_rank_, tp_world_size_); + vocab_start_ = vocab_range.first; + vocab_end_ = vocab_range.second; + + // Allocate embedding weights + embed_tokens_ = Tensor::create({vocab_shard_size_, hs}, config_.dtype, device_type_, device_id_); + lm_head_ = Tensor::create({vocab_shard_size_, hs}, config_.dtype, device_type_, device_id_); + norm_weight_ = Tensor::create({hs}, config_.dtype, device_type_, device_id_); + + // Allocate per-layer weights + input_layernorm_weight_.resize(nl); + q_proj_weight_.resize(nl); + q_proj_bias_.resize(nl); + k_proj_weight_.resize(nl); + k_proj_bias_.resize(nl); + v_proj_weight_.resize(nl); + v_proj_bias_.resize(nl); + o_proj_weight_.resize(nl); + post_attn_layernorm_weight_.resize(nl); + gate_proj_weight_.resize(nl); + up_proj_weight_.resize(nl); + down_proj_weight_.resize(nl); + + for (size_t i = 0; i < nl; ++i) { + input_layernorm_weight_[i] = Tensor::create({hs}, 
config_.dtype, device_type_, device_id_); + q_proj_weight_[i] = Tensor::create({hidden_shard_size_, hs}, config_.dtype, device_type_, device_id_); + q_proj_bias_[i] = Tensor::create({hidden_shard_size_}, config_.dtype, device_type_, device_id_); + k_proj_weight_[i] = Tensor::create({kv_shard_size_, hs}, config_.dtype, device_type_, device_id_); + k_proj_bias_[i] = Tensor::create({kv_shard_size_}, config_.dtype, device_type_, device_id_); + v_proj_weight_[i] = Tensor::create({kv_shard_size_, hs}, config_.dtype, device_type_, device_id_); + v_proj_bias_[i] = Tensor::create({kv_shard_size_}, config_.dtype, device_type_, device_id_); + o_proj_weight_[i] = Tensor::create({hs, hidden_shard_size_}, config_.dtype, device_type_, device_id_); + post_attn_layernorm_weight_[i] = Tensor::create({hs}, config_.dtype, device_type_, device_id_); + gate_proj_weight_[i] = Tensor::create({intermediate_shard_size_, hs}, config_.dtype, device_type_, device_id_); + up_proj_weight_[i] = Tensor::create({intermediate_shard_size_, hs}, config_.dtype, device_type_, device_id_); + down_proj_weight_[i] = Tensor::create({hs, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + } + + // Allocate KV cache + allocateKVCache(); +} + +void Qwen2Model::allocateKVCache() { + const size_t nl = config_.num_layers; + const size_t maxseq = config_.max_seq_len; + + k_cache_.resize(nl); + v_cache_.resize(nl); + + for (size_t i = 0; i < nl; ++i) { + k_cache_[i] = Tensor::create({maxseq, kv_shard_size_}, config_.dtype, device_type_, device_id_); + v_cache_[i] = Tensor::create({maxseq, kv_shard_size_}, config_.dtype, device_type_, device_id_); + } +} + +void Qwen2Model::allocateBuffers(size_t seq_len) { + if (buffer_seq_len_ == seq_len && hidden_states_ != nullptr) { + return; + } + + const size_t hs = config_.hidden_size; + + hidden_states_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, device_id_); + normed_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, 
device_id_); + q_local_ = Tensor::create({seq_len, hidden_shard_size_}, config_.dtype, device_type_, device_id_); + k_local_ = Tensor::create({seq_len, kv_shard_size_}, config_.dtype, device_type_, device_id_); + v_local_ = Tensor::create({seq_len, kv_shard_size_}, config_.dtype, device_type_, device_id_); + attn_proj_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, device_id_); + gate_ = Tensor::create({seq_len, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + up_ = Tensor::create({seq_len, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + mlp_out_ = Tensor::create({seq_len, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + down_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, device_id_); + logits_local_ = Tensor::create({1, vocab_shard_size_}, config_.dtype, device_type_, device_id_); + pos_ids_ = Tensor::create({seq_len}, LLAISYS_DTYPE_I64, device_type_, device_id_); + input_ids_ = Tensor::create({seq_len}, LLAISYS_DTYPE_I64, device_type_, device_id_); + max_idx_ = Tensor::create({1}, LLAISYS_DTYPE_I64, device_type_, device_id_); + max_val_ = Tensor::create({1}, config_.dtype, device_type_, device_id_); + buffer_seq_len_ = seq_len; +} + +int64_t Qwen2Model::infer(const int64_t* token_ids, size_t num_tokens) { + const size_t nh = config_.num_heads; + const size_t dh = config_.head_dim; + const size_t nl = config_.num_layers; + const float eps = config_.rms_norm_eps; + const float theta = config_.rope_theta; + const float scale = 1.0f / std::sqrt(static_cast(dh)); + + // Allocate buffers for this sequence length + allocateBuffers(num_tokens); + + // Create input token tensor + input_ids_->load(token_ids); + + // Create position ids: [cache_len, cache_len+1, ..., cache_len+num_tokens-1] + std::vector pos_ids_data(num_tokens); + for (size_t i = 0; i < num_tokens; ++i) { + pos_ids_data[i] = static_cast(cache_len_ + i); + } + pos_ids_->load(pos_ids_data.data()); + + // Embedding 
lookup: hidden_states = embed_tokens[input_ids] + ops::parallelEmbedding(hidden_states_, input_ids_, embed_tokens_, vocab_start_, vocab_end_); + if (tp_world_size_ > 1) { + core::context().allReduce(hidden_states_); + } + + // Process each layer + for (size_t layer = 0; layer < nl; ++layer) { + // Input layernorm + ops::rms_norm(normed_, hidden_states_, input_layernorm_weight_[layer], eps); + + ops::columnParallelLinear(q_local_, normed_, q_proj_weight_[layer], q_proj_bias_[layer]); + ops::columnParallelLinear(k_local_, normed_, k_proj_weight_[layer], k_proj_bias_[layer]); + ops::columnParallelLinear(v_local_, normed_, v_proj_weight_[layer], v_proj_bias_[layer]); + + auto q_full_flat = ops::gatherLastDim(q_local_); + auto k_full_flat = ops::gatherLastDim(k_local_); + auto q_full = q_full_flat->view({num_tokens, nh, dh}); + auto k_full = k_full_flat->view({num_tokens, config_.num_kv_heads, dh}); + + ops::rope(q_full, q_full, pos_ids_, theta); + ops::rope(k_full, k_full, pos_ids_, theta); + + // Update KV cache + auto k_cache_slice = k_cache_[layer]->slice(0, cache_len_, cache_len_ + num_tokens); + auto v_cache_slice = v_cache_[layer]->slice(0, cache_len_, cache_len_ + num_tokens); + auto k_after_rope_flat = k_full->view({num_tokens, kv_hidden_size_}); + auto k_local_after_rope = makeContiguous( + sliceColumns(k_after_rope_flat, tp_rank_ * kv_shard_size_, (tp_rank_ + 1) * kv_shard_size_)); + ops::rearrange(k_cache_slice, k_local_after_rope); + ops::rearrange(v_cache_slice, v_local_); + + auto k_full_cache = ops::gatherLastDim(k_cache_[layer]->slice(0, 0, cache_len_ + num_tokens)); + auto v_full_cache = ops::gatherLastDim(v_cache_[layer]->slice(0, 0, cache_len_ + num_tokens)); + auto k_attn = k_full_cache->view({cache_len_ + num_tokens, config_.num_kv_heads, dh}); + auto v_attn = v_full_cache->view({cache_len_ + num_tokens, config_.num_kv_heads, dh}); + + // Self attention + auto attn_out = Tensor::create({num_tokens, nh, dh}, config_.dtype, device_type_, device_id_); 
+ ops::self_attention(attn_out, q_full, k_attn, v_attn, scale); + + // Output projection + auto attn_out_flat = attn_out->view({num_tokens, nh * dh}); + auto attn_out_local = makeContiguous( + sliceColumns(attn_out_flat, tp_rank_ * hidden_shard_size_, (tp_rank_ + 1) * hidden_shard_size_)); + ops::rowParallelLinear(attn_proj_, attn_out_local, o_proj_weight_[layer]); + + // Residual connection: hidden_states = hidden_states + attn_proj + ops::add(hidden_states_, hidden_states_, attn_proj_); + + // Post attention layernorm + ops::rms_norm(normed_, hidden_states_, post_attn_layernorm_weight_[layer], eps); + + // MLP: gate_proj, up_proj, swiglu, down_proj + ops::columnParallelLinear(gate_, normed_, gate_proj_weight_[layer], nullptr); + ops::columnParallelLinear(up_, normed_, up_proj_weight_[layer], nullptr); + ops::swiglu(mlp_out_, gate_, up_); + ops::rowParallelLinear(down_, mlp_out_, down_proj_weight_[layer]); + + // Residual connection: hidden_states = hidden_states + down + ops::add(hidden_states_, hidden_states_, down_); + } + + // Final layer norm + ops::rms_norm(normed_, hidden_states_, norm_weight_, eps); + + // Get last token's hidden state for prediction + auto last_hidden = normed_->slice(0, num_tokens - 1, num_tokens); // [1, hs] + + auto& ctx = core::context(); + ctx.setDevice(device_type_, device_id_); + auto& runtime = ctx.runtime(); + + // LM head local shard + ops::columnParallelLinear(logits_local_, last_hidden, lm_head_, nullptr); + auto logits_local_1d = logits_local_->view({vocab_shard_size_}); + ops::argmax(max_idx_, max_val_, logits_local_1d); + + int64_t local_idx = 0; + runtime.api()->memcpy_sync(&local_idx, max_idx_->data(), sizeof(int64_t), LLAISYS_MEMCPY_D2H); + local_idx += static_cast(vocab_start_); + + int64_t next_token = local_idx; + if (tp_world_size_ > 1) { + auto local_global_idx = Tensor::create({1}, LLAISYS_DTYPE_I64, device_type_, device_id_); + local_global_idx->load(&local_idx); + auto gathered_values = ctx.allGather(max_val_); + 
auto gathered_indices = ctx.allGather(local_global_idx); + + std::vector gathered_value_bytes(gathered_values->numel() * gathered_values->elementSize()); + std::vector gathered_index_host(gathered_indices->numel()); + runtime.api()->memcpy_sync(gathered_value_bytes.data(), gathered_values->data(), gathered_value_bytes.size(), LLAISYS_MEMCPY_D2H); + runtime.api()->memcpy_sync(gathered_index_host.data(), gathered_indices->data(), + gathered_index_host.size() * sizeof(int64_t), LLAISYS_MEMCPY_D2H); + + float best_value = readScalarAsFloat(gathered_value_bytes.data(), config_.dtype, 0); + next_token = gathered_index_host[0]; + for (size_t rank = 1; rank < tp_world_size_; ++rank) { + const float value = readScalarAsFloat(gathered_value_bytes.data(), config_.dtype, rank); + const int64_t index = gathered_index_host[rank]; + if (value > best_value || (value == best_value && index < next_token)) { + best_value = value; + next_token = index; + } + } + } + + // Update cache length + cache_len_ += num_tokens; + + return next_token; +} + +} // namespace llaisys::models diff --git a/src/models/qwen2/qwen2.hpp b/src/models/qwen2/qwen2.hpp new file mode 100644 index 000000000..a27ca690d --- /dev/null +++ b/src/models/qwen2/qwen2.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "../../tensor/tensor.hpp" +#include "../../ops/add/op.hpp" +#include "../../ops/argmax/op.hpp" +#include "../../ops/embedding/op.hpp" +#include "../../ops/linear/op.hpp" +#include "../../ops/rearrange/op.hpp" +#include "../../ops/rms_norm/op.hpp" +#include "../../ops/rope/op.hpp" +#include "../../ops/self_attention/op.hpp" +#include "../../ops/swiglu/op.hpp" + +#include +#include + +namespace llaisys::models { + +struct Qwen2Config { + llaisysDataType_t dtype; + size_t num_layers; // nlayer: 28 + size_t hidden_size; // hs: 1536 + size_t num_heads; // nh: 12 + size_t num_kv_heads; // nkvh: 2 + size_t head_dim; // dh: 128 (hidden_size / num_heads) + size_t intermediate_size; // di: 8960 + size_t max_seq_len; 
// maxseq: 131072 + size_t vocab_size; // voc: 151936 + float rms_norm_eps; // epsilon: 1e-6 + float rope_theta; // theta: 10000 + int64_t eos_token_id; // end_token: 151643 +}; + +class Qwen2Model { +private: + Qwen2Config config_; + llaisysDeviceType_t device_type_; + int device_id_; + size_t tp_rank_; + size_t tp_world_size_; + size_t hidden_shard_size_; + size_t kv_hidden_size_; + size_t kv_shard_size_; + size_t intermediate_shard_size_; + size_t vocab_shard_size_; + size_t vocab_start_; + size_t vocab_end_; + + // Weights + tensor_t embed_tokens_; // [vocab_shard_size, hidden_size] + tensor_t lm_head_; // [vocab_shard_size, hidden_size] + tensor_t norm_weight_; // [hidden_size] + + // Per-layer weights + std::vector input_layernorm_weight_; // [hidden_size] + std::vector q_proj_weight_; // [hidden_shard_size, hidden_size] + std::vector q_proj_bias_; // [hidden_shard_size] + std::vector k_proj_weight_; // [kv_shard_size, hidden_size] + std::vector k_proj_bias_; // [kv_shard_size] + std::vector v_proj_weight_; // [kv_shard_size, hidden_size] + std::vector v_proj_bias_; // [kv_shard_size] + std::vector o_proj_weight_; // [hidden_size, hidden_shard_size] + std::vector post_attn_layernorm_weight_; // [hidden_size] + std::vector gate_proj_weight_; // [intermediate_shard_size, hidden_size] + std::vector up_proj_weight_; // [intermediate_shard_size, hidden_size] + std::vector down_proj_weight_; // [hidden_size, intermediate_shard_size] + + // KV Cache: local shard per rank [num_layers][max_seq_len, kv_shard_size] + std::vector k_cache_; + std::vector v_cache_; + size_t cache_len_; // Current cached sequence length + + // Intermediate buffers + tensor_t hidden_states_; // [seq_len, hidden_size] + tensor_t normed_; // [seq_len, hidden_size] + tensor_t q_local_; // [seq_len, hidden_shard_size] + tensor_t k_local_; // [seq_len, kv_shard_size] + tensor_t v_local_; // [seq_len, kv_shard_size] + tensor_t attn_proj_; // [seq_len, hidden_size] + tensor_t gate_; // [seq_len, 
intermediate_shard_size] + tensor_t up_; // [seq_len, intermediate_shard_size] + tensor_t mlp_out_; // [seq_len, intermediate_shard_size] + tensor_t down_; // [seq_len, hidden_size] + tensor_t logits_local_; // [1, vocab_shard_size] + tensor_t pos_ids_; // [seq_len] + tensor_t input_ids_; // [seq_len] + tensor_t max_idx_; // [1] + tensor_t max_val_; // [1] + size_t buffer_seq_len_; + + void allocateBuffers(size_t seq_len); + void allocateKVCache(); + +public: + Qwen2Model(const Qwen2Config& config, llaisysDeviceType_t device_type, int device_id); + ~Qwen2Model() = default; + + // Weight accessors for loading + tensor_t& embedTokens() { return embed_tokens_; } + tensor_t& lmHead() { return lm_head_; } + tensor_t& normWeight() { return norm_weight_; } + tensor_t& inputLayernormWeight(size_t layer) { return input_layernorm_weight_[layer]; } + tensor_t& qProjWeight(size_t layer) { return q_proj_weight_[layer]; } + tensor_t& qProjBias(size_t layer) { return q_proj_bias_[layer]; } + tensor_t& kProjWeight(size_t layer) { return k_proj_weight_[layer]; } + tensor_t& kProjBias(size_t layer) { return k_proj_bias_[layer]; } + tensor_t& vProjWeight(size_t layer) { return v_proj_weight_[layer]; } + tensor_t& vProjBias(size_t layer) { return v_proj_bias_[layer]; } + tensor_t& oProjWeight(size_t layer) { return o_proj_weight_[layer]; } + tensor_t& postAttnLayernormWeight(size_t layer) { return post_attn_layernorm_weight_[layer]; } + tensor_t& gateProjWeight(size_t layer) { return gate_proj_weight_[layer]; } + tensor_t& upProjWeight(size_t layer) { return up_proj_weight_[layer]; } + tensor_t& downProjWeight(size_t layer) { return down_proj_weight_[layer]; } + + // Inference + int64_t infer(const int64_t* token_ids, size_t num_tokens); + + // Reset KV cache + void resetCache() { cache_len_ = 0; } + + const Qwen2Config& config() const { return config_; } +}; + +} // namespace llaisys::models diff --git a/src/ops/add/nvidia/add_nvidia.cu b/src/ops/add/nvidia/add_nvidia.cu new file 
mode 100644 index 000000000..6ebaea621 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cu @@ -0,0 +1,61 @@ +#include "add_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void addKernel(T *c, const T *a, const T *b, size_t numel) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += stride) { + const float value = llaisys::device::nvidia::scalarToFloat(a[idx]) + + llaisys::device::nvidia::scalarToFloat(b[idx]); + c[idx] = llaisys::device::nvidia::floatToScalar(value); + } +} + +} // namespace + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel, llaisysStream_t stream) { + const int max_blocks = 4096; + const int threads = 256; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + if (blocks == 0) { + return; + } + + switch (type) { + case LLAISYS_DTYPE_F32: + addKernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_F16: + addKernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_BF16: + addKernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/add/nvidia/add_nvidia.cuh b/src/ops/add/nvidia/add_nvidia.cuh new file mode 100644 index 000000000..2ed37ba55 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cuh @@ -0,0 +1,18 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void add(std::byte *c, const std::byte *a, const 
std::byte *b, llaisysDataType_t type, size_t numel, llaisysStream_t stream); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d7..39dc69698 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,9 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/add_nvidia.cuh" +#endif namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,8 +28,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::add(c->data(), a->data(), b->data(), c->dtype(), c->numel(), llaisys::core::context().runtime().stream()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 000000000..942273f74 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,45 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include + +template +void argmax_(N *max_idx, T *max_val, const T *vals, size_t numel) { + max_idx[0] = 0; + max_val[0] = vals[0]; + if constexpr (std::is_same_v || std::is_same_v) { + for (size_t i = 1; i < numel; i++) { + if (llaisys::utils::cast(vals[i]) > llaisys::utils::cast(max_val[0])) { + max_val[0] = vals[i]; + max_idx[0] = static_cast(i); + } + } + } else { + for (size_t i = 1; i < numel; i++) { + if (vals[i] > max_val[0]) { + max_val[0] = vals[i]; + max_idx[0] = static_cast(i); + } + } + } +} + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + switch (type) { + case 
LLAISYS_DTYPE_F32: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_BF16: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_F16: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 000000000..891a6d20a --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size); +} diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cu b/src/ops/argmax/nvidia/argmax_nvidia.cu new file mode 100644 index 000000000..7e5e5172d --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cu @@ -0,0 +1,89 @@ +#include "argmax_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void argmaxKernel(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + __shared__ float shared_vals[256]; + __shared__ int64_t shared_idx[256]; + + const int tid = threadIdx.x; + float thread_max = -std::numeric_limits::infinity(); + int64_t thread_idx = 0; + + for (size_t idx = static_cast(tid); idx < numel; idx += blockDim.x) { + const float value = llaisys::device::nvidia::scalarToFloat(vals[idx]); + if (value > thread_max) { + thread_max = value; + thread_idx = static_cast(idx); + } + } + + shared_vals[tid] = thread_max; + shared_idx[tid] = thread_idx; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + const float other_val = shared_vals[tid + offset]; + 
const int64_t other_idx = shared_idx[tid + offset]; + if (other_val > shared_vals[tid]) { + shared_vals[tid] = other_val; + shared_idx[tid] = other_idx; + } + } + __syncthreads(); + } + + if (tid == 0) { + max_idx[0] = shared_idx[0]; + max_val[0] = llaisys::device::nvidia::floatToScalar(shared_vals[0]); + } +} + +} // namespace + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel, llaisysStream_t stream) { + if (numel == 0) { + return; + } + + constexpr int threads = 256; + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + auto *max_idx_ptr = reinterpret_cast(max_idx); + + switch (type) { + case LLAISYS_DTYPE_F32: + argmaxKernel<<<1, threads, 0, cuda_stream>>>( + max_idx_ptr, + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_F16: + argmaxKernel<<<1, threads, 0, cuda_stream>>>( + max_idx_ptr, + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_BF16: + argmaxKernel<<<1, threads, 0, cuda_stream>>>( + max_idx_ptr, + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cuh b/src/ops/argmax/nvidia/argmax_nvidia.cuh new file mode 100644 index 000000000..d90c10555 --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel, llaisysStream_t stream); +} diff --git a/src/ops/argmax/op.cpp 
b/src/ops/argmax/op.cpp index 6dc37d426..89c1d46fb 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,37 @@ #include "op.hpp" +#include "../../utils.hpp" +#include "llaisys.h" + +#include "cpu/argmax_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/argmax_nvidia.cuh" +#endif + namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + CHECK_SAME_DTYPE(max_val->dtype(), vals->dtype()); + ASSERT(max_idx->dtype() == llaisysDataType_t::LLAISYS_DTYPE_I64, "Argmax: the data type of max_idx must be LLAISYS_DTYPE_I64."); + ASSERT(max_idx->isContiguous() && max_val->isContiguous() && vals->isContiguous(), "Argmax: all tensors must be contiguous."); + + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel(), + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 000000000..531eb279e --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,32 @@ +#include "embedding_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include + +template +void embedding_(T *out, const I *index, const T *weight, size_t num_indices, size_t hidden_size) { + for (size_t i = 0; i < num_indices; i++) { + memcpy(out + i * 
hidden_size, weight + index[i] * hidden_size, hidden_size * sizeof(T)); + } +} + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, size_t hidden_size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), reinterpret_cast(weight), num_indices, hidden_size); + case LLAISYS_DTYPE_BF16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num_indices, hidden_size); + case LLAISYS_DTYPE_F16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num_indices, hidden_size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 000000000..160bac617 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, size_t hidden_size); +} \ No newline at end of file diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cu b/src/ops/embedding/nvidia/embedding_nvidia.cu new file mode 100644 index 000000000..4e91d6772 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cu @@ -0,0 +1,140 @@ +#include "embedding_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void embeddingKernel(T *out, const int64_t *index, const T *weight, size_t num_indices, size_t hidden_size) { + const size_t total = num_indices * hidden_size; + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + 
threadIdx.x; linear_idx < total; linear_idx += stride) { + const size_t row = linear_idx / hidden_size; + const size_t col = linear_idx % hidden_size; + const int64_t vocab_row = index[row]; + out[linear_idx] = weight[static_cast(vocab_row) * hidden_size + col]; + } +} + +template +__global__ void parallelEmbeddingKernel(T *out, const int64_t *index, const T *weight, size_t num_indices, size_t hidden_size, + size_t vocab_start, size_t vocab_end) { + const size_t total = num_indices * hidden_size; + const size_t stride = static_cast(blockDim.x) * gridDim.x; + const T zero = llaisys::device::nvidia::floatToScalar(0.0f); + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; linear_idx < total; linear_idx += stride) { + const size_t row = linear_idx / hidden_size; + const size_t col = linear_idx % hidden_size; + const int64_t vocab_row = index[row]; + if (vocab_row >= static_cast(vocab_start) && vocab_row < static_cast(vocab_end)) { + const size_t local_row = static_cast(vocab_row) - vocab_start; + out[linear_idx] = weight[local_row * hidden_size + col]; + } else { + out[linear_idx] = zero; + } + } +} + +} // namespace + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, llaisysStream_t stream) { + const size_t total = num_indices * hidden_size; + if (total == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((total + threads - 1) / threads, static_cast(max_blocks))); + const auto *index_ptr = reinterpret_cast(index); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + embeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size); + break; + case LLAISYS_DTYPE_F16: + embeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + 
reinterpret_cast(weight), + num_indices, + hidden_size); + break; + case LLAISYS_DTYPE_BF16: + embeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +void parallelEmbedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, size_t vocab_start, size_t vocab_end, llaisysStream_t stream) { + const size_t total = num_indices * hidden_size; + if (total == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((total + threads - 1) / threads, static_cast(max_blocks))); + const auto *index_ptr = reinterpret_cast(index); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + parallelEmbeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size, + vocab_start, + vocab_end); + break; + case LLAISYS_DTYPE_F16: + parallelEmbeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size, + vocab_start, + vocab_end); + break; + case LLAISYS_DTYPE_BF16: + parallelEmbeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size, + vocab_start, + vocab_end); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cuh b/src/ops/embedding/nvidia/embedding_nvidia.cuh new file mode 100644 index 000000000..a7d260c1a --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cuh @@ -0,0 +1,12 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void embedding(std::byte 
*out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, llaisysStream_t stream); +void parallelEmbedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, size_t vocab_start, size_t vocab_end, llaisysStream_t stream); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d06..46cf57430 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,107 @@ #include "op.hpp" +#include "cpu/embedding_cpu.hpp" +#include "llaisys.h" +#include "../../utils.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/embedding_nvidia.cuh" +#endif + namespace llaisys::ops { + +namespace { + +template +void parallelEmbeddingCpuImpl(std::byte *out, const std::byte *index, const std::byte *weight, size_t num_indices, size_t hidden_size, + size_t vocab_start, size_t vocab_end) { + auto *out_ptr = reinterpret_cast(out); + const auto *index_ptr = reinterpret_cast(index); + const auto *weight_ptr = reinterpret_cast(weight); + for (size_t row = 0; row < num_indices; ++row) { + const int64_t vocab_row = index_ptr[row]; + for (size_t col = 0; col < hidden_size; ++col) { + if (vocab_row >= static_cast(vocab_start) && vocab_row < static_cast(vocab_end)) { + const size_t local_row = static_cast(vocab_row) - vocab_start; + out_ptr[row * hidden_size + col] = weight_ptr[local_row * hidden_size + col]; + } else { + out_ptr[row * hidden_size + col] = llaisys::utils::cast(0.0f); + } + } + } +} + +void parallelEmbeddingCpu(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, size_t vocab_start, size_t vocab_end) { + switch (type) { + case LLAISYS_DTYPE_F32: + return parallelEmbeddingCpuImpl(out, index, weight, num_indices, hidden_size, vocab_start, vocab_end); + case LLAISYS_DTYPE_F16: + return parallelEmbeddingCpuImpl(out, index, weight, 
num_indices, hidden_size, vocab_start, vocab_end); + case LLAISYS_DTYPE_BF16: + return parallelEmbeddingCpuImpl(out, index, weight, num_indices, hidden_size, vocab_start, vocab_end); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace + void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + ASSERT(out->shape()[1] == weight->shape()[1], ""); + ASSERT(out->shape()[0] == index->shape()[0], ""); + ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), ""); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->shape()[0], weight->shape()[1]); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->shape()[0], weight->shape()[1]); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->shape()[0], weight->shape()[1], + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} + +void parallelEmbedding(tensor_t out, tensor_t index, tensor_t weight_local, size_t vocab_start, size_t vocab_end) { + CHECK_SAME_DEVICE(out, index, weight_local); + CHECK_SAME_DTYPE(out->dtype(), weight_local->dtype()); + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "ParallelEmbedding: index must be int64."); + ASSERT(out->ndim() == 2, "ParallelEmbedding: out must be [num_tokens, hidden_size]."); + ASSERT(index->ndim() == 1, "ParallelEmbedding: index must be [num_tokens]."); + ASSERT(weight_local->ndim() == 2, "ParallelEmbedding: weight must be [local_vocab, hidden_size]."); + ASSERT(out->shape()[0] == 
index->shape()[0], "ParallelEmbedding: num_tokens mismatch."); + ASSERT(out->shape()[1] == weight_local->shape()[1], "ParallelEmbedding: hidden size mismatch."); + ASSERT(out->isContiguous() && index->isContiguous() && weight_local->isContiguous(), + "ParallelEmbedding: all tensors must be contiguous."); + ASSERT(vocab_start <= vocab_end, "ParallelEmbedding: invalid vocab shard range."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return parallelEmbeddingCpu(out->data(), index->data(), weight_local->data(), out->dtype(), out->shape()[0], out->shape()[1], vocab_start, vocab_end); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return parallelEmbeddingCpu(out->data(), index->data(), weight_local->data(), out->dtype(), out->shape()[0], out->shape()[1], vocab_start, vocab_end); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::parallelEmbedding(out->data(), index->data(), weight_local->data(), out->dtype(), out->shape()[0], out->shape()[1], + vocab_start, vocab_end, llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/op.hpp b/src/ops/embedding/op.hpp index 37216c0cf..d2acd3023 100644 --- a/src/ops/embedding/op.hpp +++ b/src/ops/embedding/op.hpp @@ -4,4 +4,5 @@ namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight); +void parallelEmbedding(tensor_t out, tensor_t index, tensor_t weight_local, size_t vocab_start, size_t vocab_end); } diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 000000000..5174d180e --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,37 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void linear_(T *out, const T *in, const T *weight, const T *bias, std::vector out_shape, 
std::vector in_shape, std::vector weight_shape, std::vector bias_shape) { + for (size_t i = 0; i < out_shape[0]; i++) { + for (size_t j = 0; j < out_shape[1]; j++) { + float val = 0.0F; // 使用 float 累加,避免 f16/bf16 精度损失 + for (size_t k = 0; k < in_shape[1]; k++) { + val += llaisys::utils::cast(in[i * in_shape[1] + k]) + * llaisys::utils::cast(weight[j * weight_shape[1] + k]); + } + if (bias) { + val += llaisys::utils::cast(bias[j]); + } + out[i * out_shape[1] + j] = llaisys::utils::cast(val); + } + } +} + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, std::vector out_shape, std::vector in_shape, std::vector weight_shape, std::vector bias_shape) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), out_shape, in_shape, weight_shape, bias_shape); + case LLAISYS_DTYPE_BF16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), out_shape, in_shape, weight_shape, bias_shape); + case LLAISYS_DTYPE_F16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), out_shape, in_shape, weight_shape, bias_shape); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 000000000..107dccc81 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, std::vector out_shape, std::vector in_shape, std::vector weight_shape, std::vector bias_shape); +} \ No newline at end of file diff --git 
a/src/ops/linear/nvidia/linear_nvidia.cu b/src/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..c707e4ecd --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,114 @@ +#include "linear_nvidia.cuh" + +#include "../../../core/llaisys_core.hpp" +#include "../../../device/nvidia/cuda_utils.cuh" +#include "../../../device/nvidia/nvidia_resource.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +#define CUBLAS_CHECK(EXPR__) \ + do { \ + cublasStatus_t status__ = (EXPR__); \ + if (status__ != CUBLAS_STATUS_SUCCESS) { \ + throw std::runtime_error("cuBLAS call failed"); \ + } \ + } while (0) + +inline cudaDataType_t toCudaDataType(llaisysDataType_t type) { + switch (type) { + case LLAISYS_DTYPE_F32: + return CUDA_R_32F; + case LLAISYS_DTYPE_F16: + return CUDA_R_16F; + case LLAISYS_DTYPE_BF16: + return CUDA_R_16BF; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +template +__global__ void addBiasKernel(T *out, const T *bias, size_t numel, size_t cols) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += stride) { + const size_t col = idx % cols; + const float value = llaisys::device::nvidia::scalarToFloat(out[idx]) + + llaisys::device::nvidia::scalarToFloat(bias[col]); + out[idx] = llaisys::device::nvidia::floatToScalar(value); + } +} + +template +void launchBias(std::byte *out, const std::byte *bias, size_t numel, size_t cols, cudaStream_t stream) { + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + addBiasKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + numel, + cols); + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace + +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t m, size_t n, 
size_t k, llaisysStream_t stream) { + if (m == 0 || n == 0 || k == 0) { + return; + } + + auto &runtime = llaisys::core::context().runtime(); + cublasHandle_t handle = llaisys::device::nvidia::resource(runtime.deviceId()).cublasHandle(); + CUBLAS_CHECK(cublasSetStream(handle, llaisys::device::nvidia::toCudaStream(stream))); + + const float alpha = 1.0f; + const float beta = 0.0f; + const cudaDataType_t data_type = toCudaDataType(type); + + CUBLAS_CHECK(cublasGemmEx( + handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + static_cast(n), + static_cast(m), + static_cast(k), + &alpha, + weight, + data_type, + static_cast(k), + in, + data_type, + static_cast(k), + &beta, + out, + data_type, + static_cast(n), + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT)); + + if (bias != nullptr) { + const size_t numel = m * n; + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + switch (type) { + case LLAISYS_DTYPE_F32: + launchBias(out, bias, numel, n, cuda_stream); + break; + case LLAISYS_DTYPE_F16: + launchBias(out, bias, numel, n, cuda_stream); + break; + case LLAISYS_DTYPE_BF16: + launchBias(out, bias, numel, n, cuda_stream); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/linear/nvidia/linear_nvidia.cuh b/src/ops/linear/nvidia/linear_nvidia.cuh new file mode 100644 index 000000000..8da88d895 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t m, size_t n, size_t k, llaisysStream_t stream); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f8655..3ef71e4c8 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,94 @@ #include "op.hpp" +#include "cpu/linear_cpu.hpp" +#include "../rearrange/op.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/linear_nvidia.cuh" +#endif + namespace llaisys::ops { + +tensor_t gatherLastDim(const tensor_t &local) { + ASSERT(local != nullptr, "gatherLastDim input must not be null."); + ASSERT(local->isContiguous(), "gatherLastDim requires a contiguous tensor."); + + auto &ctx = llaisys::core::context(); + if (!ctx.distributedInitialized() || ctx.distributedWorldSize() == 1) { + return local; + } + + auto gathered = ctx.allGather(local); + const size_t local_ndim = local->ndim(); + ASSERT(local_ndim >= 1, "gatherLastDim requires tensor rank >= 1."); + + std::vector order; + order.reserve(local_ndim + 1); + for (size_t dim = 1; dim < local_ndim; ++dim) { + order.push_back(dim); + } + order.push_back(0); + order.push_back(local_ndim); + + auto permuted = gathered->permute(order); + auto merged = llaisys::Tensor::create(permuted->shape(), permuted->dtype(), permuted->deviceType(), permuted->deviceId()); + ops::rearrange(merged, permuted); + + std::vector final_shape = local->shape(); + final_shape.back() *= static_cast(ctx.distributedWorldSize()); + return merged->view(final_shape); +} + +void columnParallelLinear(tensor_t out_local, tensor_t in, tensor_t weight_local, tensor_t bias_local) { + linear(out_local, in, weight_local, bias_local); +} + +void rowParallelLinear(tensor_t out, tensor_t in_local, tensor_t weight_local) { + auto &ctx = llaisys::core::context(); + linear(out, in_local, weight_local, nullptr); + if (ctx.distributedInitialized() && ctx.distributedWorldSize() > 1) { + ctx.allReduce(out); + } +} + void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + // 
TO_BE_IMPLEMENTED(); + if (bias == nullptr) { + CHECK_SAME_DEVICE(out, in, weight); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), "Linear: all tensors must be contiguous."); + } else { + CHECK_SAME_DEVICE(out, in, weight, bias); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype(), bias->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous() && bias->isContiguous(), "Linear: all tensors must be contiguous."); + ASSERT(bias->shape()[0] == weight->shape()[0], "Linear: bias.shape[0] must be equal to weight.shape[0]."); + } + ASSERT(out->shape()[0] == in->shape()[0], "Linear: out.shape[0] must be equal to in.shape[0]."); + ASSERT(out->shape()[1] == weight->shape()[0], "Linear: out.shape[1] must be equal to weight.shape[0]."); + ASSERT(in->shape()[1] == weight->shape()[1], "Linear: in.shape[1] must be equal to weight.shape[1]."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + if (bias == nullptr) { + return cpu::linear(out->data(), in->data(), weight->data(), nullptr, out->dtype(), out->shape(), in->shape(), weight->shape(), {}); + } + return cpu::linear(out->data(), in->data(), weight->data(), bias->data(), out->dtype(), out->shape(), in->shape(), weight->shape(), bias->shape()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + if (bias == nullptr) { + return cpu::linear(out->data(), in->data(), weight->data(), nullptr, out->dtype(), out->shape(), in->shape(), weight->shape(), {}); + } else { + return cpu::linear(out->data(), in->data(), weight->data(), bias->data(), out->dtype(), out->shape(), in->shape(), weight->shape(), bias->shape()); + } +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::linear(out->data(), in->data(), weight->data(), bias == nullptr ? 
nullptr : bias->data(), out->dtype(), + out->shape()[0], out->shape()[1], in->shape()[1], llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/op.hpp b/src/ops/linear/op.hpp index 7bf06f017..75af49ac5 100644 --- a/src/ops/linear/op.hpp +++ b/src/ops/linear/op.hpp @@ -4,4 +4,7 @@ namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias); +void columnParallelLinear(tensor_t out_local, tensor_t in, tensor_t weight_local, tensor_t bias_local); +void rowParallelLinear(tensor_t out, tensor_t in_local, tensor_t weight_local); +tensor_t gatherLastDim(const tensor_t &local); } diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cpp b/src/ops/rearrange/cpu/rearrange_cpu.cpp new file mode 100644 index 000000000..b34b00b54 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.cpp @@ -0,0 +1,78 @@ +#include "rearrange_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void rearrange_(T *out, const T *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides) { + size_t ndim = shape.size(); + + if (ndim == 0) { + out[0] = in[0]; + return; + } + + // Calculate total number of elements + size_t numel = 1; + for (size_t i = 0; i < ndim; ++i) { + numel *= shape[i]; + } + + // Use indices to iterate through all elements + std::vector indices(ndim, 0); + + for (size_t i = 0; i < numel; ++i) { + // Calculate source and destination offsets + ptrdiff_t out_offset = 0; + ptrdiff_t in_offset = 0; + for (size_t d = 0; d < ndim; ++d) { + out_offset += indices[d] * out_strides[d]; + in_offset += indices[d] * in_strides[d]; + } + + // Copy element + out[out_offset] = in[in_offset]; + + // Increment indices (like counting in mixed radix) + for (ptrdiff_t d = ndim - 1; d >= 0; --d) { + indices[d]++; + if (indices[d] < shape[d]) { + break; + } + indices[d] = 0; + } + } +} + +namespace 
llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides, + llaisysDataType_t type) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_BF16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_F16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_I64: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/rearrange/cpu/rearrange_cpu.hpp b/src/ops/rearrange/cpu/rearrange_cpu.hpp new file mode 100644 index 000000000..d9fafb7b0 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.hpp @@ -0,0 +1,13 @@ +#pragma once +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides, + llaisysDataType_t type); +} diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/ops/rearrange/nvidia/rearrange_nvidia.cu new file mode 100644 index 000000000..49180db76 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cu @@ -0,0 +1,95 @@ +#include "rearrange_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void rearrangeKernel(T *out, const T *in, size_t numel, size_t ndim, const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + 
threadIdx.x; linear_idx < numel; linear_idx += stride) { + size_t remaining = linear_idx; + ptrdiff_t out_offset = 0; + ptrdiff_t in_offset = 0; + for (ptrdiff_t dim = static_cast(ndim) - 1; dim >= 0; --dim) { + const size_t coord = remaining % shape[dim]; + remaining /= shape[dim]; + out_offset += static_cast(coord) * out_strides[dim]; + in_offset += static_cast(coord) * in_strides[dim]; + } + out[out_offset] = in[in_offset]; + } +} + +template +void launchRearrange(std::byte *out, const std::byte *in, size_t numel, size_t ndim, const size_t *shape_dev, + const ptrdiff_t *out_strides_dev, const ptrdiff_t *in_strides_dev, cudaStream_t stream) { + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + rearrangeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + numel, + ndim, + shape_dev, + out_strides_dev, + in_strides_dev); + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace + +void rearrange(std::byte *out, const std::byte *in, const std::vector &shape, const std::vector &out_strides, + const std::vector &in_strides, llaisysDataType_t type, llaisysStream_t stream) { + size_t numel = 1; + for (size_t dim : shape) { + numel *= dim; + } + if (numel == 0) { + return; + } + + const size_t ndim = shape.size(); + auto cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + size_t *shape_dev = nullptr; + ptrdiff_t *out_strides_dev = nullptr; + ptrdiff_t *in_strides_dev = nullptr; + CUDA_CHECK(cudaMalloc(&shape_dev, ndim * sizeof(size_t))); + CUDA_CHECK(cudaMalloc(&out_strides_dev, ndim * sizeof(ptrdiff_t))); + CUDA_CHECK(cudaMalloc(&in_strides_dev, ndim * sizeof(ptrdiff_t))); + CUDA_CHECK(cudaMemcpyAsync(shape_dev, shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, cuda_stream)); + CUDA_CHECK(cudaMemcpyAsync(out_strides_dev, out_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, cuda_stream)); + 
CUDA_CHECK(cudaMemcpyAsync(in_strides_dev, in_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, cuda_stream)); + + switch (type) { + case LLAISYS_DTYPE_F32: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + case LLAISYS_DTYPE_F16: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + case LLAISYS_DTYPE_BF16: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + case LLAISYS_DTYPE_I64: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + default: + CUDA_CHECK(cudaFree(shape_dev)); + CUDA_CHECK(cudaFree(out_strides_dev)); + CUDA_CHECK(cudaFree(in_strides_dev)); + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaFree(shape_dev)); + CUDA_CHECK(cudaFree(out_strides_dev)); + CUDA_CHECK(cudaFree(in_strides_dev)); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cuh b/src/ops/rearrange/nvidia/rearrange_nvidia.cuh new file mode 100644 index 000000000..213623a00 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cuh @@ -0,0 +1,21 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange(std::byte *out, const std::byte *in, const std::vector &shape, const std::vector &out_strides, + const std::vector &in_strides, llaisysDataType_t type, llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange(std::byte *out, const std::byte *in, const std::vector &shape, const std::vector &out_strides, const std::vector &in_strides, llaisysDataType_t type); +} diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 017a6ae59..5b0d28e11 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -1,7 +1,47 @@ #include 
"op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rearrange_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rearrange_nvidia.cuh" +#endif + namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rearrange(out->data(), in->data(), + out->shape(), out->strides(), in->strides(), + out->dtype()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rearrange(out->data(), in->data(), + out->shape(), out->strides(), in->strides(), + out->dtype()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + if (out->isContiguous() && in->isContiguous()) { + return llaisys::core::context().runtime().api()->memcpy_async( + out->data(), + in->data(), + out->numel() * out->elementSize(), + LLAISYS_MEMCPY_D2D, + llaisys::core::context().runtime().stream()); + } + return nvidia::rearrange(out->data(), in->data(), out->shape(), out->strides(), in->strides(), out->dtype(), + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 000000000..e9c960307 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,60 @@ +#include "rms_norm_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +/** + * RMS Norm 核心计算 + * + * 内存布局 (行优先): + * input/out 视为 [batch, hidden_size] 的二维数组 + * weight 是长度为 hidden_size 的一维数组 + * + * 计算流程 (对每个 batch i): + * 1. 计算 RMS_i = sqrt(mean(input[i][:]^2) + eps) + * 2. 
out[i][j] = (input[i][j] / RMS_i) * weight[j] + * + * 索引计算: + * - input[i][j] = input[i * hidden_size + j] + * - weight[j] 对所有 batch 共享 + */ +template +void rms_norm_(T *out, const T *input, const T *weight, size_t batch, size_t hidden_size, float eps) { + // 遍历每个独立的向量 (batch 维度) + for (size_t i = 0; i < batch; i++) { + // Step 1: 计算 sum(x^2),在 hidden_size 维度上累加 + float sum_sq = 0.0F; + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(input[i * hidden_size + j]); + sum_sq += val * val; + } + // Step 2: 计算 RMS = sqrt(mean(x^2) + eps) + float rms = std::sqrt(sum_sq / static_cast(hidden_size) + eps); + + // Step 3: 归一化并乘以 weight + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(input[i * hidden_size + j]); + float wgt = llaisys::utils::cast(weight[j]); + out[i * hidden_size + j] = llaisys::utils::cast((val / rms) * wgt); + } + } +} + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, size_t hidden_size, float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(input), + reinterpret_cast(weight), batch, hidden_size, eps); + case LLAISYS_DTYPE_BF16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(input), + reinterpret_cast(weight), batch, hidden_size, eps); + case LLAISYS_DTYPE_F16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(input), + reinterpret_cast(weight), batch, hidden_size, eps); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 000000000..7f3fa4f97 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *input, const 
std::byte *weight, llaisysDataType_t type, size_t batch, size_t hidden_size, float eps); +} diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu new file mode 100644 index 000000000..fd4b005fe --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -0,0 +1,86 @@ +#include "rms_norm_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void rmsNormKernel(T *out, const T *input, const T *weight, size_t hidden_size, float eps) { + __shared__ float shared_sum[256]; + + const size_t row = static_cast(blockIdx.x); + const size_t row_offset = row * hidden_size; + const int tid = threadIdx.x; + + float sum_sq = 0.0f; + for (size_t col = static_cast(tid); col < hidden_size; col += blockDim.x) { + const float value = llaisys::device::nvidia::scalarToFloat(input[row_offset + col]); + sum_sq += value * value; + } + + shared_sum[tid] = sum_sq; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + shared_sum[tid] += shared_sum[tid + offset]; + } + __syncthreads(); + } + + const float inv_rms = rsqrtf(shared_sum[0] / static_cast(hidden_size) + eps); + for (size_t col = static_cast(tid); col < hidden_size; col += blockDim.x) { + const float value = llaisys::device::nvidia::scalarToFloat(input[row_offset + col]); + const float scale = llaisys::device::nvidia::scalarToFloat(weight[col]); + out[row_offset + col] = llaisys::device::nvidia::floatToScalar(value * inv_rms * scale); + } +} + +} // namespace + +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, + size_t hidden_size, float eps, llaisysStream_t stream) { + if (batch == 0 || hidden_size == 0) { + return; + } + + constexpr int threads = 256; + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case 
LLAISYS_DTYPE_F32: + rmsNormKernel<<(batch), threads, 0, cuda_stream>>>( + reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(weight), + hidden_size, + eps); + break; + case LLAISYS_DTYPE_F16: + rmsNormKernel<<(batch), threads, 0, cuda_stream>>>( + reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(weight), + hidden_size, + eps); + break; + case LLAISYS_DTYPE_BF16: + rmsNormKernel<<(batch), threads, 0, cuda_stream>>>( + reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(weight), + hidden_size, + eps); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh new file mode 100644 index 000000000..3688b93a2 --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh @@ -0,0 +1,19 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, + size_t hidden_size, float eps, llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, size_t hidden_size, float eps); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9d..803c957c3 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,62 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rms_norm_nvidia.cuh" +#endif + namespace llaisys::ops { +/** + * RMS Norm (Root Mean Square Layer Normalization) + * + * 公式: out[i][j] = (in[i][j] / RMS(in[i])) * weight[j] + * 其中 RMS(x) = sqrt(mean(x^2) + eps) + * + * 
参数说明: + * - batch: 需要独立归一化的向量数量,等于 in.numel() / hidden_size + * 例如 in 形状为 [2, 3, 768],则 batch = 2*3 = 6 + * - hidden_size: 每个向量的特征维度,即 in 的最后一个维度 + * RMS 在此维度上计算,weight 的长度也等于 hidden_size + * + * 张量形状: + * - in/out: [..., hidden_size] (任意前导维度,最后一维是 hidden_size) + * - weight: [hidden_size] (所有向量共享同一组 weight) + */ void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), "RmsNorm: all tensors must be contiguous."); + + ASSERT(in->ndim() >= 1, "RmsNorm: in must have at least 1 dimension."); + ASSERT(weight->ndim() == 1, "RmsNorm: weight must be 1D."); + + // hidden_size: in 的最后一个维度,RMS 在此维度上计算 + size_t hidden_size = in->shape().back(); + // batch: 独立归一化的向量数量 (前导维度的乘积) + size_t batch = in->numel() / hidden_size; + + ASSERT(weight->shape()[0] == hidden_size, "RmsNorm: weight.shape[0] must be equal to in's last dimension."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch, hidden_size, eps); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch, hidden_size, eps); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch, hidden_size, eps, + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 000000000..1f9734252 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,53 @@ +#include 
"rope_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +void rope_(T *out, const T *in, const int64_t *pos_ids, float theta, size_t seq_len, size_t n_heads, size_t head_dim) { + const size_t half_dim = head_dim / 2; + // Precompute inverse frequencies (denominators) for each dimension j + // To match PyTorch's behavior: phi = pos / (theta ^ (2j/d)) + std::vector inv_freqs(half_dim); + for (size_t j = 0; j < half_dim; ++j) { + inv_freqs[j] = static_cast(std::pow(static_cast(theta), 2.0 * static_cast(j) / static_cast(head_dim))); + } + + for (size_t i = 0; i < seq_len; ++i) { + const float pos = static_cast(pos_ids[i]); + for (size_t h = 0; h < n_heads; ++h) { + const size_t base_idx = (i * n_heads + h) * head_dim; + const T* in_ptr = in + base_idx; + T* out_ptr = out + base_idx; + for (size_t j = 0; j < half_dim; ++j) { + float phi = pos / inv_freqs[j]; + float cos_phi = std::cos(phi); + float sin_phi = std::sin(phi); + // a = in[j], b = in[j + half_dim] + float a = llaisys::utils::cast(in_ptr[j]); + float b = llaisys::utils::cast(in_ptr[j + half_dim]); + float a_prime = a * cos_phi - b * sin_phi; + float b_prime = b * cos_phi + a * sin_phi; + out_ptr[j] = llaisys::utils::cast(a_prime); + out_ptr[j + half_dim] = llaisys::utils::cast(b_prime); + } + } + } +} + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), reinterpret_cast(in), pos_ids, theta, seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), pos_ids, theta, seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), pos_ids, theta, seq_len, n_heads, head_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace 
llaisys::ops::cpu diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 000000000..ca300f8bf --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim); +} \ No newline at end of file diff --git a/src/ops/rope/nvidia/rope_nvidia.cu b/src/ops/rope/nvidia/rope_nvidia.cu new file mode 100644 index 000000000..b24016927 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cu @@ -0,0 +1,88 @@ +#include "rope_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void ropeKernel(T *out, const T *in, const int64_t *pos_ids, float theta, size_t seq_len, size_t n_heads, size_t head_dim) { + const size_t half_dim = head_dim / 2; + const size_t total = seq_len * n_heads * half_dim; + const size_t stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; linear_idx < total; linear_idx += stride) { + const size_t j = linear_idx % half_dim; + const size_t tmp = linear_idx / half_dim; + const size_t h = tmp % n_heads; + const size_t i = tmp / n_heads; + const size_t base_idx = (i * n_heads + h) * head_dim; + + const float pos = static_cast(pos_ids[i]); + const float phi = pos / powf(theta, 2.0f * static_cast(j) / static_cast(head_dim)); + const float cos_phi = cosf(phi); + const float sin_phi = sinf(phi); + + const float a = llaisys::device::nvidia::scalarToFloat(in[base_idx + j]); + const float b = llaisys::device::nvidia::scalarToFloat(in[base_idx + j + half_dim]); + out[base_idx + j] = llaisys::device::nvidia::floatToScalar(a * cos_phi - b * sin_phi); + out[base_idx + j + half_dim] = 
llaisys::device::nvidia::floatToScalar(b * cos_phi + a * sin_phi); + } +} + +} // namespace + +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, llaisysStream_t stream) { + const size_t total = seq_len * n_heads * (head_dim / 2); + if (total == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((total + threads - 1) / threads, static_cast(max_blocks))); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + ropeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + pos_ids, + theta, + seq_len, + n_heads, + head_dim); + break; + case LLAISYS_DTYPE_F16: + ropeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + pos_ids, + theta, + seq_len, + n_heads, + head_dim); + break; + case LLAISYS_DTYPE_BF16: + ropeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + pos_ids, + theta, + seq_len, + n_heads, + head_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rope/nvidia/rope_nvidia.cuh b/src/ops/rope/nvidia/rope_nvidia.cuh new file mode 100644 index 000000000..b9f636564 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cuh @@ -0,0 +1,20 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim); +} diff --git 
a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64e..dfe24b350 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,46 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rope_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rope_nvidia.cuh" +#endif + namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(), "RoPE: all tensors must be contiguous."); + ASSERT(out->shape() == in->shape(), "RoPE: out and in must have same shape."); + ASSERT(out->ndim() == 3, "RoPE: out must be 3D tensor [seq_len, n_heads, head_dim]."); + ASSERT(in->ndim() == 3, "RoPE: in must be 3D tensor [seq_len, n_heads, head_dim]."); + ASSERT(pos_ids->ndim() == 1, "RoPE: pos_ids must be 1D tensor [seq_len]."); + ASSERT(pos_ids->shape()[0] == out->shape()[0], "RoPE: pos_ids length must match seq_len."); + ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids must be int64."); + ASSERT(out->shape()[2] % 2 == 0, "RoPE: head_dim must be even."); + + size_t seq_len = out->shape()[0]; + size_t n_heads = out->shape()[1]; + size_t head_dim = out->shape()[2]; + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), reinterpret_cast(pos_ids->data()), theta, out->dtype(), seq_len, n_heads, head_dim); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), reinterpret_cast(pos_ids->data()), theta, out->dtype(), seq_len, n_heads, head_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rope(out->data(), in->data(), reinterpret_cast(pos_ids->data()), theta, out->dtype(), seq_len, + n_heads, head_dim, 
llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 000000000..ad735cea0 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,113 @@ +#include "self_attention_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include + +template +void self_attention_(T *attn_val, const T *q, const T *k, const T *v, + float scale, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim) { + // Calculate the group size for GQA (Grouped Query Attention) + // Each kv head is shared by (nhead / nkvhead) query heads + const size_t group_size = nhead / nkvhead; + + // For each query position + for (size_t qi = 0; qi < qlen; ++qi) { + // For each attention head + for (size_t h = 0; h < nhead; ++h) { + // Find the corresponding kv head for this query head + const size_t kv_h = h / group_size; + + // Compute attention scores for this query position and head + // Q[qi, h, :] @ K[:, kv_h, :].T -> scores[kvlen] + std::vector scores(kvlen); + + // Calculate Q @ K^T * scale + for (size_t ki = 0; ki < kvlen; ++ki) { + double score = 0.0; + for (size_t d = 0; d < head_dim; ++d) { + // q index: [qi, h, d] -> qi * nhead * head_dim + h * head_dim + d + // k index: [ki, kv_h, d] -> ki * nkvhead * head_dim + kv_h * head_dim + d + double q_val = llaisys::utils::cast(q[qi * nhead * head_dim + h * head_dim + d]); + double k_val = llaisys::utils::cast(k[ki * nkvhead * head_dim + kv_h * head_dim + d]); + score += q_val * k_val; + } + scores[ki] = score * static_cast(scale); + } + + // Apply causal mask: for position qi in query, we can only attend to positions 0..qi+(kvlen-qlen) + // The causal mask is: temp_mask = torch.ones(L, S).tril(diagonal=S-L) + // attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + // This 
means: for query position qi, we can attend to key positions 0 to (kvlen - qlen + qi) + const size_t max_attend = kvlen - qlen + qi; + for (size_t ki = max_attend + 1; ki < kvlen; ++ki) { + scores[ki] = -std::numeric_limits::infinity(); + } + + // Compute softmax + // Find max for numerical stability + double max_score = scores[0]; + for (size_t ki = 1; ki < kvlen; ++ki) { + if (scores[ki] > max_score) { + max_score = scores[ki]; + } + } + + // Compute exp and sum + double sum_exp = 0.0; + for (size_t ki = 0; ki < kvlen; ++ki) { + scores[ki] = std::exp(scores[ki] - max_score); + sum_exp += scores[ki]; + } + + // Normalize + for (size_t ki = 0; ki < kvlen; ++ki) { + scores[ki] /= sum_exp; + } + + // Compute attention output: softmax(scores) @ V + // attn_val[qi, h, :] = sum over ki of (scores[ki] * V[ki, kv_h, :]) + for (size_t d = 0; d < head_dim; ++d) { + double out_val = 0.0; + for (size_t ki = 0; ki < kvlen; ++ki) { + // v index: [ki, kv_h, d] -> ki * nkvhead * head_dim + kv_h * head_dim + d + double v_val = llaisys::utils::cast(v[ki * nkvhead * head_dim + kv_h * head_dim + d]); + out_val += scores[ki] * v_val; + } + // attn_val index: [qi, h, d] -> qi * nhead * head_dim + h * head_dim + d + attn_val[qi * nhead * head_dim + h * head_dim + d] = llaisys::utils::cast(out_val); + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t type, + size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nhead, nkvhead, head_dim); + case LLAISYS_DTYPE_BF16: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nhead, nkvhead, head_dim); + case 
LLAISYS_DTYPE_F16: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nhead, nkvhead, head_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 000000000..83e56b126 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t type, + size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim); +} diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu new file mode 100644 index 000000000..af559e99b --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -0,0 +1,145 @@ +#include "self_attention_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void selfAttentionKernel(T *attn_val, const T *q, const T *k, const T *v, float scale, size_t qlen, size_t kvlen, + size_t nhead, size_t nkvhead, size_t head_dim) { + extern __shared__ float shared_mem[]; + float *scores = shared_mem; + float *reduce_buf = shared_mem + kvlen; + + const size_t qi = static_cast(blockIdx.x); + const size_t h = static_cast(blockIdx.y); + const size_t group_size = nhead / nkvhead; + const size_t kv_h = h / group_size; + const int tid = threadIdx.x; + + const size_t q_base = (qi * nhead + h) * head_dim; + const size_t max_attend = kvlen - qlen + qi; + + float thread_max = -INFINITY; + for (size_t ki = static_cast(tid); ki < kvlen; ki += blockDim.x) { + float score = -INFINITY; 
+ if (ki <= max_attend) { + score = 0.0f; + const size_t k_base = (ki * nkvhead + kv_h) * head_dim; + for (size_t d = 0; d < head_dim; ++d) { + score += llaisys::device::nvidia::scalarToFloat(q[q_base + d]) + * llaisys::device::nvidia::scalarToFloat(k[k_base + d]); + } + score *= scale; + } + scores[ki] = score; + if (score > thread_max) { + thread_max = score; + } + } + + reduce_buf[tid] = thread_max; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + reduce_buf[tid] = fmaxf(reduce_buf[tid], reduce_buf[tid + offset]); + } + __syncthreads(); + } + const float max_score = reduce_buf[0]; + + float thread_sum = 0.0f; + for (size_t ki = static_cast(tid); ki < kvlen; ki += blockDim.x) { + const float prob = expf(scores[ki] - max_score); + scores[ki] = prob; + thread_sum += prob; + } + + reduce_buf[tid] = thread_sum; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + reduce_buf[tid] += reduce_buf[tid + offset]; + } + __syncthreads(); + } + const float inv_sum = 1.0f / reduce_buf[0]; + + for (size_t d = static_cast(tid); d < head_dim; d += blockDim.x) { + float out_val = 0.0f; + for (size_t ki = 0; ki < kvlen; ++ki) { + const size_t v_base = (ki * nkvhead + kv_h) * head_dim; + out_val += (scores[ki] * inv_sum) * llaisys::device::nvidia::scalarToFloat(v[v_base + d]); + } + attn_val[q_base + d] = llaisys::device::nvidia::floatToScalar(out_val); + } +} + +} // namespace + +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim, + llaisysStream_t stream) { + if (qlen == 0 || kvlen == 0 || nhead == 0 || nkvhead == 0 || head_dim == 0) { + return; + } + + constexpr int threads = 128; + const dim3 grid(static_cast(qlen), static_cast(nhead), 1); + const size_t shared_bytes = (kvlen + threads) * sizeof(float); + 
const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + selfAttentionKernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, + qlen, + kvlen, + nhead, + nkvhead, + head_dim); + break; + case LLAISYS_DTYPE_F16: + selfAttentionKernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, + qlen, + kvlen, + nhead, + nkvhead, + head_dim); + break; + case LLAISYS_DTYPE_BF16: + selfAttentionKernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, + qlen, + kvlen, + nhead, + nkvhead, + head_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cuh b/src/ops/self_attention/nvidia/self_attention_nvidia.cuh new file mode 100644 index 000000000..b702a8082 --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cuh @@ -0,0 +1,20 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim, + llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim); +} diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d620142..dd5e12686 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,65 @@ 
#include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/self_attention_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/self_attention_nvidia.cuh" +#endif + namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype()); + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "SelfAttention: all tensors must be contiguous."); + + // Shape checks + // q: [qlen, nhead, head_dim] + // k: [kvlen, nkvhead, head_dim] + // v: [kvlen, nkvhead, head_dim] + // attn_val: [qlen, nhead, head_dim] + ASSERT(q->ndim() == 3, "SelfAttention: q must be 3D tensor [qlen, nhead, head_dim]."); + ASSERT(k->ndim() == 3, "SelfAttention: k must be 3D tensor [kvlen, nkvhead, head_dim]."); + ASSERT(v->ndim() == 3, "SelfAttention: v must be 3D tensor [kvlen, nkvhead, head_dim]."); + ASSERT(attn_val->ndim() == 3, "SelfAttention: attn_val must be 3D tensor [qlen, nhead, head_dim]."); + + size_t qlen = q->shape()[0]; + size_t nhead = q->shape()[1]; + size_t head_dim = q->shape()[2]; + size_t kvlen = k->shape()[0]; + size_t nkvhead = k->shape()[1]; + + // Validate shapes + ASSERT(k->shape()[0] == v->shape()[0], "SelfAttention: k and v must have same kvlen."); + ASSERT(k->shape()[1] == v->shape()[1], "SelfAttention: k and v must have same nkvhead."); + ASSERT(k->shape()[2] == head_dim, "SelfAttention: k head_dim must match q head_dim."); + ASSERT(v->shape()[2] == head_dim, "SelfAttention: v head_dim must match q head_dim."); + ASSERT(attn_val->shape()[0] == qlen, "SelfAttention: attn_val qlen must match q qlen."); + ASSERT(attn_val->shape()[1] == nhead, "SelfAttention: attn_val nhead must match q nhead."); + ASSERT(attn_val->shape()[2] == head_dim, "SelfAttention: attn_val head_dim must match q head_dim."); + 
ASSERT(nhead % nkvhead == 0, "SelfAttention: nhead must be divisible by nkvhead for GQA."); + ASSERT(kvlen >= qlen, "SelfAttention: kvlen must be >= qlen."); + + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + scale, attn_val->dtype(), qlen, kvlen, nhead, nkvhead, head_dim); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + scale, attn_val->dtype(), qlen, kvlen, nhead, nkvhead, head_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, attn_val->dtype(), qlen, kvlen, + nhead, nkvhead, head_dim, llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 000000000..e34414acd --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,51 @@ +#include "swiglu_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +// SwiGLU: out = up * silu(gate) +// where silu(x) = x * sigmoid(x) = x / (1 + exp(-x)) +// Formula: out_i = up_i * gate_i / (1 + exp(-gate_i)) + +template +void swiglu_(T *out, const T *gate, const T *up, size_t numel) { + for (size_t i = 0; i < numel; ++i) { + // Convert to double for numerical stability + double gate_val = llaisys::utils::cast(gate[i]); + double up_val = llaisys::utils::cast(up[i]); + + // silu(gate) = gate / (1 + exp(-gate)) + double silu_gate = gate_val / (1.0 + std::exp(-gate_val)); + + // out = up * silu(gate) + double out_val = up_val * silu_gate; + + out[i] = llaisys::utils::cast(out_val); + } +} + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const 
std::byte *up, + llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + case LLAISYS_DTYPE_BF16: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 000000000..31debbec1 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, + llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/ops/swiglu/nvidia/swiglu_nvidia.cu new file mode 100644 index 000000000..928dbd5ff --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -0,0 +1,63 @@ +#include "swiglu_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void swigluKernel(T *out, const T *gate, const T *up, size_t numel) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += stride) { + const float gate_val = llaisys::device::nvidia::scalarToFloat(gate[idx]); + const float up_val = llaisys::device::nvidia::scalarToFloat(up[idx]); + const float silu = gate_val / (1.0f + expf(-gate_val)); + out[idx] = llaisys::device::nvidia::floatToScalar(up_val * silu); + } +} + +} // namespace + +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel, llaisysStream_t 
stream) { + if (numel == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + swigluKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + case LLAISYS_DTYPE_F16: + swigluKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + case LLAISYS_DTYPE_BF16: + swigluKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cuh b/src/ops/swiglu/nvidia/swiglu_nvidia.cuh new file mode 100644 index 000000000..73d853457 --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel, llaisysStream_t stream); +} diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc97..521e54d6c 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,37 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/swiglu_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/swiglu_nvidia.cuh" +#endif + namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_SHAPE(out->shape(), gate->shape(), up->shape()); + CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype()); + ASSERT(out->isContiguous() && gate->isContiguous() && 
up->isContiguous(), + "SwiGLU: all tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel(), + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb65..a0e538aaa 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -1,10 +1,14 @@ #include "tensor.hpp" #include "../utils.hpp" +#include "llaisys.h" +#include #include #include #include +#include +#include namespace llaisys { @@ -164,27 +168,92 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + // row-major order, contiguous in memory + if (ndim() == 0) { // ndim == 0 means scalar, contiguous in memory + return true; + } + ptrdiff_t expected_stride = 1; + for (size_t i = ndim(); i > 0; i--) { + // be attentive to the index of the dimension, overflow problem + size_t dim = i - 1; + if (strides()[dim] != expected_stride) { + return false; + } + expected_stride *= static_cast(shape()[dim]); + } return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // TO_BE_IMPLEMENTED(); + std::vector new_shape(order.size(), 0); + std::vector new_strides(order.size(), 0); + for (size_t i = 0; i < order.size(); i++) { + size_t dim = order[i]; + new_shape[i] = shape()[dim]; + new_strides[i] = strides()[dim]; + } + TensorMeta 
new_meta{dtype(), new_shape, new_strides}; + return std::shared_ptr(new Tensor(new_meta, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // TO_BE_IMPLEMENTED(); + // 1.must be contiguous + if (!isContiguous()) { + throw std::runtime_error("Can't view non-contiguous tensor."); + } + + // 2.new shape must have the same number of elements as the original tensor + size_t new_numel = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies()); + if (new_numel != numel()) { + throw std::runtime_error("Can't view tensor with different number of elements."); + } + + // 3.calculate new strides + size_t new_ndim = shape.size(); + std::vector new_strides(new_ndim); + ptrdiff_t stride = 1; + for (size_t i = new_ndim; i > 0; i--) { + size_t dim = i - 1; + new_strides[dim] = stride; + stride *= static_cast(shape[dim]); + } + + // 4.create new tensor + TensorMeta new_meta{dtype(), shape, new_strides}; + return std::shared_ptr(new Tensor(new_meta, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // TO_BE_IMPLEMENTED(); + if (dim >= ndim() || start >= shape()[dim] || end > shape()[dim] || start >= end) { + throw std::runtime_error("Can't slice out of range."); + } + std::vector new_shape = shape(); + std::vector new_strides = strides(); + + new_shape[dim] = end - start; + TensorMeta new_meta{dtype(), new_shape, strides()}; + size_t new_offset = _offset + start * strides()[dim] * elementSize(); + return std::shared_ptr(new Tensor(new_meta, _storage, new_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + size_t total_bytes = numel() * elementSize(); + core::context().setDevice(deviceType(), deviceId()); + + if (!isContiguous()) { + throw std::runtime_error("Can't load into non-contiguous 
tensor."); + } + + if (deviceType() == LLAISYS_DEVICE_CPU) { + std::memcpy(data(), src_, total_bytes); + } else { + core::context().runtime().api()->memcpy_sync(data(), src_, total_bytes, LLAISYS_MEMCPY_H2D); + } } tensor_t Tensor::contiguous() const { diff --git a/test/ops/self_attention.py b/test/ops/self_attention.py index a042b51be..abf3927a8 100644 --- a/test/ops/self_attention.py +++ b/test/ops/self_attention.py @@ -15,7 +15,7 @@ def torch_self_attention(attn_val, query, key, value, scale): L, S = query.size(-2), key.size(-2) attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device) - temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=S-L) + temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=S-L) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) diff --git a/test/test_dist.py b/test/test_dist.py new file mode 100644 index 000000000..556b94735 --- /dev/null +++ b/test/test_dist.py @@ -0,0 +1,95 @@ +import argparse +import os +import tempfile +import uuid + +import llaisys +import torch +import torch.multiprocessing as mp + + +def tensor_from_torch(torch_tensor: torch.Tensor, rank: int) -> llaisys.Tensor: + tensor = llaisys.Tensor( + torch_tensor.shape, + dtype=llaisys.DataType.F32, + device=llaisys.DeviceType.NVIDIA, + device_id=rank, + ) + api = llaisys.RuntimeAPI(llaisys.DeviceType.NVIDIA) + api.set_device(rank) + api.memcpy_sync( + tensor.data_ptr(), + torch_tensor.data_ptr(), + torch_tensor.numel() * torch_tensor.element_size(), + llaisys.MemcpyKind.D2D, + ) + return tensor + + +def torch_from_tensor(tensor: llaisys.Tensor, shape, rank: int) -> torch.Tensor: + result = torch.empty(shape, dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + api = llaisys.RuntimeAPI(llaisys.DeviceType.NVIDIA) + api.set_device(rank) + api.memcpy_sync( + result.data_ptr(), + tensor.data_ptr(), + result.numel() * result.element_size(), + llaisys.MemcpyKind.D2D, + ) + return result 
+ + +def worker(rank: int, world_size: int) -> None: + api = llaisys.RuntimeAPI(llaisys.DeviceType.NVIDIA) + api.set_device(rank) + dist = llaisys.DistributedContext() + dist.init(rank, world_size) + + local = torch.full((4,), float(rank + 1), dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + reduced = tensor_from_torch(local.clone(), rank) + dist.all_reduce(reduced) + reduced_result = torch_from_tensor(reduced, local.shape, rank) + expected_reduce = torch.full_like(local, float(world_size * (world_size + 1) // 2)) + torch.testing.assert_close(reduced_result, expected_reduce) + + gathered = dist.all_gather(tensor_from_torch(local, rank)) + gathered_result = torch_from_tensor(gathered, (world_size, 4), rank) + expected_gather = torch.stack( + [ + torch.full((4,), float(peer_rank + 1), dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + for peer_rank in range(world_size) + ] + ) + torch.testing.assert_close(gathered_result, expected_gather) + + broadcast_source = ( + torch.tensor([10.0, 11.0, 12.0, 13.0], dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + if rank == 0 + else torch.zeros((4,), dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + ) + broadcast_tensor = tensor_from_torch(broadcast_source, rank) + dist.broadcast(broadcast_tensor, root=0) + broadcast_result = torch_from_tensor(broadcast_tensor, (4,), rank) + expected_broadcast = torch.tensor([10.0, 11.0, 12.0, 13.0], dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + torch.testing.assert_close(broadcast_result, expected_broadcast) + + dist.barrier() + dist.finalize() + api.device_synchronize() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="nvidia", choices=["nvidia"], type=str) + parser.add_argument("--world-size", default=8, type=int) + args = parser.parse_args() + + assert args.device == "nvidia" + assert torch.cuda.device_count() >= args.world_size + + os.environ["LLAISYS_DIST_BOOTSTRAP_PATH"] = 
os.path.join( + tempfile.gettempdir(), f"llaisys_nccl_{uuid.uuid4().hex}.bin" + ) + mp.spawn(worker, args=(args.world_size,), nprocs=args.world_size, join=True) + + print("\033[92mDistributed test passed!\033[0m\n") diff --git a/test/test_infer.py b/test/test_infer.py index 59d06b874..7c3ebb203 100644 --- a/test/test_infer.py +++ b/test/test_infer.py @@ -55,7 +55,7 @@ def hf_infer( def load_llaisys_model(model_path, device_name): - model = llaisys.models.Qwen2(model_path, llaisys_device(device_name)) + model = llaisys.models.Qwen2(model_path, llaisys_device(device_name), device_id=0, rank=0, world_size=1) return model diff --git a/test/test_infer_dist.py b/test/test_infer_dist.py new file mode 100644 index 000000000..1c5aaa38b --- /dev/null +++ b/test/test_infer_dist.py @@ -0,0 +1,107 @@ +import argparse +import gc +import os +import tempfile +import time +import uuid + +import torch +import torch.multiprocessing as mp +from transformers import AutoTokenizer + +import llaisys + + +def build_input_tokens(tokenizer, prompt: str): + input_content = tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + return tokenizer.encode(input_content) + + +def worker(rank: int, world_size: int, model_path: str, inputs, max_new_tokens: int, result_holder): + dist = llaisys.DistributedContext() + dist.init(rank, world_size) + + load_start = time.time() + model = llaisys.models.Qwen2( + model_path, + llaisys.DeviceType.NVIDIA, + device_id=rank, + rank=rank, + world_size=world_size, + ) + load_elapsed = time.time() - load_start + + dist.barrier() + generate_start = time.time() + outputs = model.generate(inputs, max_new_tokens=max_new_tokens, top_k=1, top_p=1.0, temperature=1.0) + dist.barrier() + generate_elapsed = time.time() - generate_start + + if rank == 0: + result_holder["tokens"] = outputs + result_holder["load_latency"] = load_elapsed + result_holder["generate_latency"] = generate_elapsed + + 
del model + dist.finalize() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="nvidia", choices=["nvidia"], type=str) + parser.add_argument("--model", required=True, type=str) + parser.add_argument("--prompt", default="Who are you?", type=str) + parser.add_argument("--max_steps", default=32, type=int) + parser.add_argument("--world-size", default=8, type=int) + parser.add_argument("--test", action="store_true") + args = parser.parse_args() + + assert torch.cuda.device_count() >= args.world_size + + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + inputs = build_input_tokens(tokenizer, args.prompt) + + single_load_start = time.time() + single_model = llaisys.models.Qwen2( + args.model, + llaisys.DeviceType.NVIDIA, + device_id=0, + rank=0, + world_size=1, + ) + single_load_latency = time.time() - single_load_start + start_time = time.time() + single_tokens = single_model.generate(inputs, max_new_tokens=args.max_steps, top_k=1, top_p=1.0, temperature=1.0) + single_generate_latency = time.time() - start_time + del single_model + gc.collect() + torch.cuda.empty_cache() + + os.environ["LLAISYS_DIST_BOOTSTRAP_PATH"] = os.path.join( + tempfile.gettempdir(), f"llaisys_qwen2_tp_{uuid.uuid4().hex}.bin" + ) + + manager = mp.Manager() + result_holder = manager.dict() + mp.spawn(worker, args=(args.world_size, args.model, inputs, args.max_steps, result_holder), nprocs=args.world_size, join=True) + + dist_tokens = list(result_holder["tokens"]) + dist_load_latency = float(result_holder["load_latency"]) + dist_generate_latency = float(result_holder["generate_latency"]) + + print("Single-card tokens:") + print(single_tokens) + print(f"Single-card load latency: {single_load_latency:.2f}s") + print(f"Single-card generate latency: {single_generate_latency:.2f}s") + print("\nDistributed tokens:") + print(dist_tokens) + print(f"Distributed load latency: {dist_load_latency:.2f}s") + 
print(f"Distributed generate latency: {dist_generate_latency:.2f}s") + + if args.test: + assert dist_tokens == single_tokens + print("\033[92mDistributed inference test passed!\033[0m\n") diff --git a/xmake.lua b/xmake.lua index 1f65f7a95..7fd7cd651 100644 --- a/xmake.lua +++ b/xmake.lua @@ -37,6 +37,9 @@ target("llaisys-device") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-device-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -83,6 +86,9 @@ target_end() target("llaisys-ops") set_kind("static") add_deps("llaisys-ops-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-ops-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -95,6 +101,22 @@ target("llaisys-ops") on_install(function (target) end) target_end() +target("llaisys-models") + set_kind("static") + add_deps("llaisys-tensor") + add_deps("llaisys-ops") + + set_languages("cxx17") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + end + + add_files("src/models/*/*.cpp") + + on_install(function (target) end) +target_end() + target("llaisys") set_kind("shared") add_deps("llaisys-utils") @@ -102,6 +124,7 @@ target("llaisys") add_deps("llaisys-core") add_deps("llaisys-tensor") add_deps("llaisys-ops") + add_deps("llaisys-models") set_languages("cxx17") set_warnings("all", "error") diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua new file mode 100644 index 000000000..5c87bf794 --- /dev/null +++ b/xmake/nvidia.lua @@ -0,0 +1,35 @@ +local function config_nvidia_target() + add_rules("cuda") + set_values("cuda.build.devlink", true) + set_languages("cxx17") + set_warnings("all", "error") + add_cugencodes("native") + add_cuflags("--extended-lambda", {force = true}) + add_cuflags("--expt-relaxed-constexpr", {force = true}) + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + add_cuflags("-Xcompiler=-fPIC", 
{force = true}) + add_culdflags("-Xcompiler -fPIC", {force = true}) + end + add_links("cudart", "cublas", "nccl") +end + +target("llaisys-device-nvidia") + set_kind("static") + config_nvidia_target() + add_files("../src/device/nvidia/*.cu") + + on_install(function (target) end) +target_end() + +target("llaisys-ops-nvidia") + set_kind("static") + add_deps("llaisys-tensor") + config_nvidia_target() + + for _, file in ipairs(os.files("../src/ops/*/nvidia/*.cu")) do + add_files(file) + end + + on_install(function (target) end) +target_end()