diff --git a/include/llaisys/runtime.h b/include/llaisys/runtime.h index d8e6f66f1..d59f95fc6 100644 --- a/include/llaisys/runtime.h +++ b/include/llaisys/runtime.h @@ -2,6 +2,7 @@ #define LLAISYS_RUNTIME_H #include "../llaisys.h" +#include "tensor.h" __C { // Runtime API Functions @@ -42,6 +43,18 @@ __C { // Llaisys API for switching device context __export void llaisysSetContextRuntime(llaisysDeviceType_t, int); + + // Distributed runtime APIs + __export void llaisysInitDistributed(int rank, int world_size); + __export void llaisysFinalizeDistributed(); + __export uint8_t llaisysDistributedIsInitialized(); + __export int llaisysDistributedRank(); + __export int llaisysDistributedWorldSize(); + + __export void llaisysDistAllReduce(llaisysTensor_t tensor); + __export llaisysTensor_t llaisysDistAllGather(llaisysTensor_t tensor); + __export void llaisysDistBroadcast(llaisysTensor_t tensor, int root); + __export void llaisysDistBarrier(); } #endif // LLAISYS_RUNTIME_H diff --git a/python/llaisys/__init__.py b/python/llaisys/__init__.py index de8d99f48..5347bd096 100644 --- a/python/llaisys/__init__.py +++ b/python/llaisys/__init__.py @@ -1,4 +1,5 @@ from .runtime import RuntimeAPI +from .runtime import DistributedContext from .libllaisys import DeviceType from .libllaisys import DataType from .libllaisys import MemcpyKind @@ -10,6 +11,7 @@ __all__ = [ "RuntimeAPI", + "DistributedContext", "DeviceType", "DataType", "MemcpyKind", diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb527..f8c630b98 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -2,6 +2,7 @@ import sys import ctypes from pathlib import Path +import torch from .runtime import load_runtime from .runtime import LlaisysRuntimeAPI @@ -12,6 +13,8 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +from .models import load_models +from .models import LlaisysQwen2Meta, 
LlaisysQwen2Weights, llaisysQwen2Model_t def load_shared_library(): @@ -38,6 +41,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_models(LIB_LLAISYS) __all__ = [ @@ -52,4 +56,7 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + "LlaisysQwen2Meta", + "LlaisysQwen2Weights", + "llaisysQwen2Model_t", ] diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py new file mode 100644 index 000000000..68ea9190d --- /dev/null +++ b/python/llaisys/libllaisys/models.py @@ -0,0 +1,72 @@ +import ctypes +from ctypes import POINTER, c_void_p, c_size_t, c_int, c_int64, c_float, Structure +from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t +from .tensor import llaisysTensor_t + + +# Model handle type +llaisysQwen2Model_t = c_void_p + + +class LlaisysQwen2Meta(Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +class LlaisysQwen2Weights(Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", POINTER(llaisysTensor_t)), + ("attn_q_w", POINTER(llaisysTensor_t)), + ("attn_q_b", POINTER(llaisysTensor_t)), + ("attn_k_w", POINTER(llaisysTensor_t)), + ("attn_k_b", POINTER(llaisysTensor_t)), + ("attn_v_w", POINTER(llaisysTensor_t)), + ("attn_v_b", POINTER(llaisysTensor_t)), + ("attn_o_w", POINTER(llaisysTensor_t)), + ("mlp_norm_w", POINTER(llaisysTensor_t)), + ("mlp_gate_w", POINTER(llaisysTensor_t)), + ("mlp_up_w", POINTER(llaisysTensor_t)), + ("mlp_down_w", POINTER(llaisysTensor_t)), + ] + + +def load_models(lib): + # llaisysQwen2ModelCreate + lib.llaisysQwen2ModelCreate.argtypes = [ + 
POINTER(LlaisysQwen2Meta), # meta + llaisysDeviceType_t, # device + POINTER(c_int), # device_ids + c_int, # ndevice + ] + lib.llaisysQwen2ModelCreate.restype = llaisysQwen2Model_t + + # llaisysQwen2ModelDestroy + lib.llaisysQwen2ModelDestroy.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelDestroy.restype = None + + # llaisysQwen2ModelWeights + lib.llaisysQwen2ModelWeights.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelWeights.restype = POINTER(LlaisysQwen2Weights) + + # llaisysQwen2ModelInfer + lib.llaisysQwen2ModelInfer.argtypes = [ + llaisysQwen2Model_t, # model + POINTER(c_int64), # token_ids + c_size_t, # ntoken + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 diff --git a/python/llaisys/libllaisys/runtime.py b/python/llaisys/libllaisys/runtime.py index 3e5b8be5b..b0604c61e 100644 --- a/python/llaisys/libllaisys/runtime.py +++ b/python/llaisys/libllaisys/runtime.py @@ -1,6 +1,7 @@ import ctypes from ctypes import c_void_p, c_size_t, c_int, Structure, CFUNCTYPE from .llaisys_types import * +from .tensor import llaisysTensor_t # Define function pointer types get_device_count_api = CFUNCTYPE(c_int) @@ -46,3 +47,30 @@ def load_runtime(lib): lib.llaisysSetContextRuntime.argtypes = [llaisysDeviceType_t, c_int] lib.llaisysSetContextRuntime.restype = None + + lib.llaisysInitDistributed.argtypes = [c_int, c_int] + lib.llaisysInitDistributed.restype = None + + lib.llaisysFinalizeDistributed.argtypes = [] + lib.llaisysFinalizeDistributed.restype = None + + lib.llaisysDistributedIsInitialized.argtypes = [] + lib.llaisysDistributedIsInitialized.restype = ctypes.c_uint8 + + lib.llaisysDistributedRank.argtypes = [] + lib.llaisysDistributedRank.restype = c_int + + lib.llaisysDistributedWorldSize.argtypes = [] + lib.llaisysDistributedWorldSize.restype = c_int + + lib.llaisysDistAllReduce.argtypes = [llaisysTensor_t] + lib.llaisysDistAllReduce.restype = None + + lib.llaisysDistAllGather.argtypes = [llaisysTensor_t] + lib.llaisysDistAllGather.restype = 
llaisysTensor_t + + lib.llaisysDistBroadcast.argtypes = [llaisysTensor_t, c_int] + lib.llaisysDistBroadcast.restype = None + + lib.llaisysDistBarrier.argtypes = [] + lib.llaisysDistBarrier.restype = None diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b21..5c21189e0 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,203 @@ -from typing import Sequence +from typing import Sequence, List from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType +from ..libllaisys import DeviceType, DataType +from ..libllaisys import LlaisysQwen2Meta, LlaisysQwen2Weights from pathlib import Path import safetensors +import json +import ctypes +import re class Qwen2: - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - + def __init__( + self, + model_path, + device: DeviceType = DeviceType.CPU, + device_id: int = 0, + rank: int = 0, + world_size: int = 1, + ): model_path = Path(model_path) - + + # Load config + config_path = model_path / "config.json" + with open(config_path, "r") as f: + config = json.load(f) + + # Extract config values + self.num_layers = config["num_hidden_layers"] # 28 + self.hidden_size = config["hidden_size"] # 1536 + self.num_heads = config["num_attention_heads"] # 12 + self.num_kv_heads = config["num_key_value_heads"] # 2 + self.head_dim = self.hidden_size // self.num_heads # 128 + self.intermediate_size = config["intermediate_size"] # 8960 + self.max_seq_len = config["max_position_embeddings"] # 131072 + self.vocab_size = config["vocab_size"] # 151936 + self.rms_norm_eps = config["rms_norm_eps"] # 1e-6 + self.rope_theta = config["rope_theta"] # 10000 + self.eos_token_id = config["eos_token_id"] # 151643 + + # Map torch dtype to llaisys dtype + torch_dtype = config.get("torch_dtype", "float32") + if torch_dtype == "bfloat16": + self.dtype = DataType.BF16 + elif torch_dtype == "float16": + self.dtype = 
DataType.F16 + else: + self.dtype = DataType.F32 + + self.device = device + self.device_id = device_id + self.rank = rank + self.world_size = world_size + + # Create model meta + meta = LlaisysQwen2Meta() + meta.dtype = self.dtype + meta.nlayer = self.num_layers + meta.hs = self.hidden_size + meta.nh = self.num_heads + meta.nkvh = self.num_kv_heads + meta.dh = self.head_dim + meta.di = self.intermediate_size + meta.maxseq = self.max_seq_len + meta.voc = self.vocab_size + meta.epsilon = self.rms_norm_eps + meta.theta = self.rope_theta + meta.end_token = self.eos_token_id + + # Create model + device_ids = (ctypes.c_int * 1)(device_id) + self._model = LIB_LLAISYS.llaisysQwen2ModelCreate( + ctypes.byref(meta), + device, + device_ids, + 1 + ) + + # Get weights pointer + self._weights = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model).contents + + # Load weights from safetensors + self._load_weights(model_path) + + def __del__(self): + if hasattr(self, "_model") and self._model is not None: + LIB_LLAISYS.llaisysQwen2ModelDestroy(self._model) + self._model = None + + def _load_weights(self, model_path: Path): + """Load weights from safetensors files.""" for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + data = safetensors.safe_open(file, framework="pt", device="cpu") + for name in data.keys(): + tensor = data.get_tensor(name) + self._load_weight(name, tensor) + + def _load_weight(self, name: str, tensor): + """Load a single weight tensor.""" + import torch + + # Convert to contiguous and get raw data pointer + tensor = self._slice_weight(name, tensor).contiguous() + data_ptr = tensor.data_ptr() + + # Map weight name to model weight + if name == "model.embed_tokens.weight": + LIB_LLAISYS.tensorLoad(self._weights.in_embed, ctypes.c_void_p(data_ptr)) + elif name == "lm_head.weight": + 
LIB_LLAISYS.tensorLoad(self._weights.out_embed, ctypes.c_void_p(data_ptr)) + elif name == "model.norm.weight": + LIB_LLAISYS.tensorLoad(self._weights.out_norm_w, ctypes.c_void_p(data_ptr)) + else: + # Parse layer index from name like "model.layers.0.xxx" + match = re.match(r"model\.layers\.(\d+)\.(.*)", name) + if match: + layer_idx = int(match.group(1)) + weight_name = match.group(2) + + if weight_name == "input_layernorm.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_norm_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.q_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_q_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.q_proj.bias": + LIB_LLAISYS.tensorLoad(self._weights.attn_q_b[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.k_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_k_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.k_proj.bias": + LIB_LLAISYS.tensorLoad(self._weights.attn_k_b[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.v_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_v_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.v_proj.bias": + LIB_LLAISYS.tensorLoad(self._weights.attn_v_b[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "self_attn.o_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.attn_o_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "post_attention_layernorm.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_norm_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "mlp.gate_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_gate_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "mlp.up_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_up_w[layer_idx], ctypes.c_void_p(data_ptr)) + elif weight_name == "mlp.down_proj.weight": + LIB_LLAISYS.tensorLoad(self._weights.mlp_down_w[layer_idx], 
ctypes.c_void_p(data_ptr)) + + def _slice_weight(self, name: str, tensor): + if self.world_size == 1: + return tensor + + def shard_dim(t, dim): + assert t.shape[dim] % self.world_size == 0 + shard = t.shape[dim] // self.world_size + start = self.rank * shard + end = start + shard + if dim == 0: + return t[start:end] + if dim == 1: + return t[:, start:end] + raise ValueError(f"Unsupported shard dim: {dim}") + + if name in {"model.embed_tokens.weight", "lm_head.weight"}: + return shard_dim(tensor, 0) + if name in {"model.norm.weight"}: + return tensor + + match = re.match(r"model\.layers\.(\d+)\.(.*)", name) + if not match: + return tensor + + weight_name = match.group(2) + if weight_name in { + "input_layernorm.weight", + "post_attention_layernorm.weight", + }: + return tensor + if weight_name in { + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "mlp.gate_proj.weight", + "mlp.up_proj.weight", + }: + return shard_dim(tensor, 0) + if weight_name in { + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", + }: + assert tensor.shape[0] % self.world_size == 0 + shard = tensor.shape[0] // self.world_size + start = self.rank * shard + end = start + shard + return tensor[start:end] + if weight_name in { + "self_attn.o_proj.weight", + "mlp.down_proj.weight", + }: + return shard_dim(tensor, 1) + return tensor def generate( self, @@ -26,8 +206,36 @@ def generate( top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, - ): - - # TODO: Implement generate function - - return [] + ) -> List[int]: + """Generate tokens using argmax sampling (for testing).""" + + if max_new_tokens is None: + max_new_tokens = 128 + + # Convert inputs to list + tokens = list(inputs) + + # First inference with all input tokens + input_array = (ctypes.c_int64 * len(tokens))(*tokens) + next_token = LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, + input_array, + len(tokens) + ) + tokens.append(next_token) + + # Generate 
remaining tokens one by one + for _ in range(max_new_tokens - 1): + if next_token == self.eos_token_id: + break + + # Inference with single token + input_array = (ctypes.c_int64 * 1)(next_token) + next_token = LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, + input_array, + 1 + ) + tokens.append(next_token) + + return tokens diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc8..997a268b8 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -21,7 +21,7 @@ def embedding(out: Tensor, index: Tensor, weight: Tensor): @staticmethod def linear(out: Tensor, inp: Tensor, weight: Tensor, bias: Tensor): LIB_LLAISYS.llaisysLinear( - out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), bias.lib_tensor() + out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), None if bias is None else bias.lib_tensor() ) @staticmethod diff --git a/python/llaisys/runtime.py b/python/llaisys/runtime.py index 15be1aa17..d8f9d5a6f 100644 --- a/python/llaisys/runtime.py +++ b/python/llaisys/runtime.py @@ -1,6 +1,7 @@ from . 
import libllaisys from .libllaisys import LIB_LLAISYS from ctypes import c_void_p +from .tensor import Tensor class RuntimeAPI: @@ -66,3 +67,32 @@ def memcpy_async( self._api.contents.memcpy_async( dst, src, size, libllaisys.llaisysMemcpyKind_t(kind), stream ) + + +class DistributedContext: + def init(self, rank: int, world_size: int) -> None: + LIB_LLAISYS.llaisysInitDistributed(rank, world_size) + + def finalize(self) -> None: + LIB_LLAISYS.llaisysFinalizeDistributed() + + def is_initialized(self) -> bool: + return bool(LIB_LLAISYS.llaisysDistributedIsInitialized()) + + def rank(self) -> int: + return int(LIB_LLAISYS.llaisysDistributedRank()) + + def world_size(self) -> int: + return int(LIB_LLAISYS.llaisysDistributedWorldSize()) + + def all_reduce(self, tensor: Tensor) -> None: + LIB_LLAISYS.llaisysDistAllReduce(tensor.lib_tensor()) + + def all_gather(self, tensor: Tensor) -> Tensor: + return Tensor(tensor=LIB_LLAISYS.llaisysDistAllGather(tensor.lib_tensor())) + + def broadcast(self, tensor: Tensor, root: int = 0) -> None: + LIB_LLAISYS.llaisysDistBroadcast(tensor.lib_tensor(), root) + + def barrier(self) -> None: + LIB_LLAISYS.llaisysDistBarrier() diff --git a/src/core/context/context.cpp b/src/core/context/context.cpp index 44894b9e7..418a9edef 100644 --- a/src/core/context/context.cpp +++ b/src/core/context/context.cpp @@ -1,5 +1,9 @@ #include "context.hpp" +#include "../../tensor/tensor.hpp" #include "../../utils.hpp" +#ifdef ENABLE_NVIDIA_API +#include "../../device/nvidia/nccl_context.hpp" +#endif #include namespace llaisys::core { @@ -32,6 +36,8 @@ Context::Context() { } Context::~Context() { + finalizeDistributed(); + // Destroy current runtime first. delete _current_runtime; @@ -52,7 +58,7 @@ Context::~Context() { void Context::setDevice(llaisysDeviceType_t device_type, int device_id) { // If doest not match the current runtime. 
if (_current_runtime == nullptr || _current_runtime->deviceType() != device_type || _current_runtime->deviceId() != device_id) { - auto runtimes = _runtime_map[device_type]; + auto &runtimes = _runtime_map[device_type]; CHECK_ARGUMENT((size_t)device_id < runtimes.size() && device_id >= 0, "invalid device id"); if (_current_runtime != nullptr) { _current_runtime->_deactivate(); @@ -70,6 +76,115 @@ Runtime &Context::runtime() { return *_current_runtime; } +void Context::initDistributed(int rank, int world_size) { +#ifdef ENABLE_NVIDIA_API + setDevice(LLAISYS_DEVICE_NVIDIA, rank); + _dist_context = std::make_unique(rank, world_size, runtime().deviceId()); + return; +#else + (void)rank; + (void)world_size; + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::finalizeDistributed() { +#ifdef ENABLE_NVIDIA_API + _dist_context.reset(); +#endif +} + +bool Context::distributedInitialized() const { +#ifdef ENABLE_NVIDIA_API + return _dist_context != nullptr; +#else + return false; +#endif +} + +int Context::distributedRank() const { +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + return _dist_context->rank(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +int Context::distributedWorldSize() const { +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + return _dist_context->worldSize(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::allReduce(const tensor_t &tensor) { + ASSERT(tensor != nullptr, "allReduce tensor must not be null."); + ASSERT(tensor->isContiguous(), "allReduce requires a contiguous tensor."); + ASSERT(tensor->deviceType() == LLAISYS_DEVICE_NVIDIA, "Distributed collectives currently support NVIDIA tensors only."); +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(tensor->deviceType(), tensor->deviceId()); + ASSERT(runtime().deviceId() == 
_dist_context->deviceId(), "Tensor device does not match distributed communicator device."); + _dist_context->allReduce(tensor->data(), tensor->numel(), tensor->dtype(), runtime().stream()); + runtime().synchronize(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +tensor_t Context::allGather(const tensor_t &tensor) { + ASSERT(tensor != nullptr, "allGather tensor must not be null."); + ASSERT(tensor->isContiguous(), "allGather requires a contiguous tensor."); + ASSERT(tensor->deviceType() == LLAISYS_DEVICE_NVIDIA, "Distributed collectives currently support NVIDIA tensors only."); +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(tensor->deviceType(), tensor->deviceId()); + ASSERT(runtime().deviceId() == _dist_context->deviceId(), "Tensor device does not match distributed communicator device."); + + std::vector gathered_shape; + gathered_shape.reserve(tensor->ndim() + 1); + gathered_shape.push_back(static_cast(_dist_context->worldSize())); + gathered_shape.insert(gathered_shape.end(), tensor->shape().begin(), tensor->shape().end()); + + auto gathered = llaisys::Tensor::create(gathered_shape, tensor->dtype(), tensor->deviceType(), tensor->deviceId()); + _dist_context->allGather(tensor->data(), gathered->data(), tensor->numel(), tensor->dtype(), runtime().stream()); + runtime().synchronize(); + return gathered; +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::broadcast(const tensor_t &tensor, int root) { + ASSERT(tensor != nullptr, "broadcast tensor must not be null."); + ASSERT(tensor->isContiguous(), "broadcast requires a contiguous tensor."); + ASSERT(tensor->deviceType() == LLAISYS_DEVICE_NVIDIA, "Distributed collectives currently support NVIDIA tensors only."); +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(tensor->deviceType(), tensor->deviceId()); + ASSERT(runtime().deviceId() == 
_dist_context->deviceId(), "Tensor device does not match distributed communicator device."); + _dist_context->broadcast(tensor->data(), tensor->numel(), tensor->dtype(), root, runtime().stream()); + runtime().synchronize(); +#else + (void)root; + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + +void Context::barrier() { +#ifdef ENABLE_NVIDIA_API + ASSERT(_dist_context != nullptr, "Distributed context is not initialized."); + setDevice(LLAISYS_DEVICE_NVIDIA, _dist_context->deviceId()); + _dist_context->barrier(runtime().stream()); + runtime().synchronize(); +#else + EXCEPTION_UNSUPPORTED_DEVICE; +#endif +} + // Global API to get thread-local context. Context &context() { thread_local Context thread_context; diff --git a/src/core/context/context.hpp b/src/core/context/context.hpp index a3ebcdecf..7804d2b4c 100644 --- a/src/core/context/context.hpp +++ b/src/core/context/context.hpp @@ -6,14 +6,29 @@ #include "../runtime/runtime.hpp" +#include #include #include +namespace llaisys { +class Tensor; +using tensor_t = std::shared_ptr; +} + +#ifdef ENABLE_NVIDIA_API +namespace llaisys::device::nvidia { +class NcclContext; +} +#endif + namespace llaisys::core { class Context { private: std::unordered_map> _runtime_map; - Runtime *_current_runtime; + Runtime *_current_runtime = nullptr; +#ifdef ENABLE_NVIDIA_API + std::unique_ptr _dist_context; +#endif Context(); public: @@ -30,6 +45,17 @@ class Context { void setDevice(llaisysDeviceType_t device_type, int device_id); Runtime &runtime(); + void initDistributed(int rank, int world_size); + void finalizeDistributed(); + bool distributedInitialized() const; + int distributedRank() const; + int distributedWorldSize() const; + + void allReduce(const tensor_t &tensor); + tensor_t allGather(const tensor_t &tensor); + void broadcast(const tensor_t &tensor, int root); + void barrier(); + friend Context &context(); }; } // namespace llaisys::core diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 
7f03a8622..f9b28248b 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -7,6 +7,7 @@ namespace llaisys::core { Runtime::Runtime(llaisysDeviceType_t device_type, int device_id) : _device_type(device_type), _device_id(device_id), _is_active(false) { _api = llaisys::device::getRuntimeAPI(_device_type); + _api->set_device(_device_id); _stream = _api->create_stream(); _allocator = new allocators::NaiveAllocator(_api); } diff --git a/src/device/nvidia/cuda_utils.cuh b/src/device/nvidia/cuda_utils.cuh new file mode 100644 index 000000000..e9fcf762b --- /dev/null +++ b/src/device/nvidia/cuda_utils.cuh @@ -0,0 +1,109 @@ +#pragma once + +#include "../../utils.hpp" + +#include +#include +#include +#include + +#include +#include + +namespace llaisys::device::nvidia { + +#define CUDA_CHECK(EXPR__) \ + do { \ + cudaError_t err__ = (EXPR__); \ + if (err__ != cudaSuccess) { \ + std::cerr << "[ERROR] CUDA call failed: " << cudaGetErrorString(err__) << " (" \ + << static_cast(err__) << ")" << EXCEPTION_LOCATION_MSG << std::endl; \ + throw std::runtime_error(cudaGetErrorString(err__)); \ + } \ + } while (0) + +#define NCCL_CHECK(EXPR__) \ + do { \ + ncclResult_t err__ = (EXPR__); \ + if (err__ != ncclSuccess) { \ + std::cerr << "[ERROR] NCCL call failed: " << ncclGetErrorString(err__) << " (" \ + << static_cast(err__) << ")" << EXCEPTION_LOCATION_MSG << std::endl; \ + throw std::runtime_error(ncclGetErrorString(err__)); \ + } \ + } while (0) + +inline cudaMemcpyKind toCudaMemcpyKind(llaisysMemcpyKind_t kind) { + switch (kind) { + case LLAISYS_MEMCPY_H2H: + return cudaMemcpyHostToHost; + case LLAISYS_MEMCPY_H2D: + return cudaMemcpyHostToDevice; + case LLAISYS_MEMCPY_D2H: + return cudaMemcpyDeviceToHost; + case LLAISYS_MEMCPY_D2D: + return cudaMemcpyDeviceToDevice; + default: + CHECK_ARGUMENT(false, "invalid memcpy kind"); + return cudaMemcpyDefault; + } +} + +inline cudaStream_t toCudaStream(llaisysStream_t stream) { + return reinterpret_cast(stream); +} + 
+template +__device__ inline float scalarToFloat(T value); + +template <> +__device__ inline float scalarToFloat(float value) { + return value; +} + +template <> +__device__ inline float scalarToFloat(llaisys::fp16_t value) { + union { + uint16_t u; + __half h; + } caster{value._v}; + return __half2float(caster.h); +} + +template <> +__device__ inline float scalarToFloat(llaisys::bf16_t value) { + union { + uint16_t u; + __nv_bfloat16 b; + } caster{value._v}; + return __bfloat162float(caster.b); +} + +template +__device__ inline T floatToScalar(float value); + +template <> +__device__ inline float floatToScalar(float value) { + return value; +} + +template <> +__device__ inline llaisys::fp16_t floatToScalar(float value) { + union { + uint16_t u; + __half h; + } caster; + caster.h = __float2half_rn(value); + return llaisys::fp16_t{caster.u}; +} + +template <> +__device__ inline llaisys::bf16_t floatToScalar(float value) { + union { + uint16_t u; + __nv_bfloat16 b; + } caster; + caster.b = __float2bfloat16(value); + return llaisys::bf16_t{caster.u}; +} + +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nccl_context.cu b/src/device/nvidia/nccl_context.cu new file mode 100644 index 000000000..136545cec --- /dev/null +++ b/src/device/nvidia/nccl_context.cu @@ -0,0 +1,221 @@ +#include "nccl_context.hpp" +#include "cuda_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace llaisys::device::nvidia { +namespace { + +std::string sanitizeForPath(const std::string &value) { + std::string sanitized = value; + for (char &ch : sanitized) { + const bool keep = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '-' || ch == '_'; + if (!keep) { + ch = '_'; + } + } + return sanitized; +} + +std::string envOrEmpty(const char *name) { + const char *value = std::getenv(name); + if (value == nullptr) { + return {}; + } + return value; +} + +} // namespace + +ncclDataType_t 
toNcclDataType(llaisysDataType_t dtype) { + switch (dtype) { + case LLAISYS_DTYPE_BOOL: + return ncclUint8; + case LLAISYS_DTYPE_I8: + return ncclInt8; + case LLAISYS_DTYPE_I32: + return ncclInt32; + case LLAISYS_DTYPE_I64: + return ncclInt64; + case LLAISYS_DTYPE_U8: + case LLAISYS_DTYPE_BYTE: + return ncclUint8; + case LLAISYS_DTYPE_U32: + return ncclUint32; + case LLAISYS_DTYPE_U64: + return ncclUint64; + case LLAISYS_DTYPE_F16: + return ncclFloat16; + case LLAISYS_DTYPE_F32: + return ncclFloat32; + case LLAISYS_DTYPE_F64: + return ncclFloat64; + case LLAISYS_DTYPE_BF16: + return ncclBfloat16; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} + +ncclRedOp_t toNcclReduceOp(NcclReduceOp op) { + switch (op) { + case NcclReduceOp::Sum: + return ncclSum; + default: + throw std::invalid_argument("Unsupported NCCL reduce op"); + } +} + +std::string resolveBootstrapPath() { + const std::string explicit_path = envOrEmpty("LLAISYS_DIST_BOOTSTRAP_PATH"); + if (!explicit_path.empty()) { + return explicit_path; + } + + const std::string dist_id = envOrEmpty("LLAISYS_DIST_ID"); + if (!dist_id.empty()) { + return "/tmp/llaisys_nccl_" + sanitizeForPath(dist_id) + ".bin"; + } + + const std::string master_addr = envOrEmpty("MASTER_ADDR"); + const std::string master_port = envOrEmpty("MASTER_PORT"); + if (!master_addr.empty() && !master_port.empty()) { + return "/tmp/llaisys_nccl_" + sanitizeForPath(master_addr) + "_" + sanitizeForPath(master_port) + ".bin"; + } + + throw std::invalid_argument( + "Distributed NCCL init requires LLAISYS_DIST_BOOTSTRAP_PATH, LLAISYS_DIST_ID, or MASTER_ADDR/MASTER_PORT."); +} + +void storeUniqueId(const std::string &path, const ncclUniqueId &unique_id) { + const std::filesystem::path file_path(path); + const std::filesystem::path dir_path = file_path.parent_path(); + if (!dir_path.empty()) { + std::filesystem::create_directories(dir_path); + } + + std::error_code remove_ec; + std::filesystem::remove(file_path, remove_ec); + + 
std::ostringstream temp_path_builder; + temp_path_builder << path << ".tmp." << ::getpid(); + const std::string temp_path = temp_path_builder.str(); + + { + std::ofstream output(temp_path, std::ios::binary | std::ios::trunc); + CHECK_ARGUMENT(output.good(), "failed to open NCCL bootstrap temp file"); + output.write(reinterpret_cast(&unique_id), sizeof(unique_id)); + CHECK_ARGUMENT(output.good(), "failed to write NCCL bootstrap data"); + output.flush(); + CHECK_ARGUMENT(output.good(), "failed to flush NCCL bootstrap data"); + } + + std::filesystem::rename(temp_path, path); +} + +ncclUniqueId loadUniqueId(int rank) { + const std::string path = resolveBootstrapPath(); + ncclUniqueId unique_id{}; + if (rank == 0) { + NCCL_CHECK(ncclGetUniqueId(&unique_id)); + storeUniqueId(path, unique_id); + return unique_id; + } + + constexpr int kMaxAttempts = 600; + constexpr auto kPollInterval = std::chrono::milliseconds(100); + constexpr auto kFreshWindow = std::chrono::seconds(5); + for (int attempt = 0; attempt < kMaxAttempts; ++attempt) { + std::error_code ec; + const bool exists = std::filesystem::exists(path, ec); + const bool size_ok = exists && !ec && std::filesystem::file_size(path, ec) == sizeof(unique_id); + const auto min_write_time = std::filesystem::file_time_type::clock::now() - kFreshWindow; + const bool fresh_enough = size_ok && !ec && std::filesystem::last_write_time(path, ec) >= min_write_time; + if (fresh_enough && !ec) { + std::ifstream input(path, std::ios::binary); + if (input.good()) { + input.read(reinterpret_cast(&unique_id), sizeof(unique_id)); + if (input.good()) { + return unique_id; + } + } + } + std::this_thread::sleep_for(kPollInterval); + } + + throw std::runtime_error("Timed out waiting for NCCL bootstrap file"); +} + +NcclContext::NcclContext(int rank, int world_size, int device_id) + : _rank(rank), + _world_size(world_size), + _device_id(device_id), + _comm(nullptr), + _barrier_buffer(nullptr) { + CHECK_ARGUMENT(world_size > 0, "world_size must 
be positive"); + CHECK_ARGUMENT(rank >= 0 && rank < world_size, "rank must be in [0, world_size)"); + CHECK_ARGUMENT(device_id >= 0, "device_id must be non-negative"); + + CUDA_CHECK(cudaSetDevice(device_id)); + const ncclUniqueId unique_id = loadUniqueId(rank); + NCCL_CHECK(ncclCommInitRank(&_comm, world_size, unique_id, rank)); + CUDA_CHECK(cudaMalloc(&_barrier_buffer, sizeof(int32_t))); +} + +NcclContext::~NcclContext() { + if (_barrier_buffer != nullptr) { + cudaFree(_barrier_buffer); + _barrier_buffer = nullptr; + } + if (_comm != nullptr) { + ncclCommDestroy(_comm); + _comm = nullptr; + } +} + +int NcclContext::rank() const { + return _rank; +} + +int NcclContext::worldSize() const { + return _world_size; +} + +int NcclContext::deviceId() const { + return _device_id; +} + +void NcclContext::allReduce(void *data, size_t count, llaisysDataType_t dtype, llaisysStream_t stream, NcclReduceOp op) const { + CHECK_ARGUMENT(data != nullptr || count == 0, "allReduce data must not be null"); + std::lock_guard lock(_mutex); + NCCL_CHECK(ncclAllReduce(data, data, count, toNcclDataType(dtype), toNcclReduceOp(op), _comm, toCudaStream(stream))); +} + +void NcclContext::allGather(const void *send_buffer, void *recv_buffer, size_t count, llaisysDataType_t dtype, llaisysStream_t stream) const { + CHECK_ARGUMENT(send_buffer != nullptr || count == 0, "allGather send buffer must not be null"); + CHECK_ARGUMENT(recv_buffer != nullptr || count == 0, "allGather recv buffer must not be null"); + std::lock_guard lock(_mutex); + NCCL_CHECK(ncclAllGather(send_buffer, recv_buffer, count, toNcclDataType(dtype), _comm, toCudaStream(stream))); +} + +void NcclContext::broadcast(void *data, size_t count, llaisysDataType_t dtype, int root, llaisysStream_t stream) const { + CHECK_ARGUMENT(data != nullptr || count == 0, "broadcast data must not be null"); + CHECK_ARGUMENT(root >= 0 && root < _world_size, "broadcast root must be in [0, world_size)"); + std::lock_guard lock(_mutex); + 
NCCL_CHECK(ncclBroadcast(data, data, count, toNcclDataType(dtype), root, _comm, toCudaStream(stream))); +} + +void NcclContext::barrier(llaisysStream_t stream) const { + std::lock_guard lock(_mutex); + CUDA_CHECK(cudaMemsetAsync(_barrier_buffer, 0, sizeof(int32_t), toCudaStream(stream))); + NCCL_CHECK(ncclAllReduce(_barrier_buffer, _barrier_buffer, 1, ncclInt32, ncclSum, _comm, toCudaStream(stream))); +} + +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nccl_context.hpp b/src/device/nvidia/nccl_context.hpp new file mode 100644 index 000000000..f46828b13 --- /dev/null +++ b/src/device/nvidia/nccl_context.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +struct ncclComm; +typedef ncclComm *ncclComm_t; + +namespace llaisys::device::nvidia { + +enum class NcclReduceOp { + Sum = 0, +}; + +class NcclContext { +private: + int _rank; + int _world_size; + int _device_id; + ncclComm_t _comm; + void *_barrier_buffer; + mutable std::mutex _mutex; + +public: + NcclContext(int rank, int world_size, int device_id); + ~NcclContext(); + + NcclContext(const NcclContext &) = delete; + NcclContext &operator=(const NcclContext &) = delete; + NcclContext(NcclContext &&) = delete; + NcclContext &operator=(NcclContext &&) = delete; + + int rank() const; + int worldSize() const; + int deviceId() const; + + void allReduce(void *data, size_t count, llaisysDataType_t dtype, llaisysStream_t stream, NcclReduceOp op = NcclReduceOp::Sum) const; + void allGather(const void *send_buffer, void *recv_buffer, size_t count, llaisysDataType_t dtype, llaisysStream_t stream) const; + void broadcast(void *data, size_t count, llaisysDataType_t dtype, int root, llaisysStream_t stream) const; + void barrier(llaisysStream_t stream) const; +}; + +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_resource.cu b/src/device/nvidia/nvidia_resource.cu index 2e63647e5..8144b5922 100644 --- a/src/device/nvidia/nvidia_resource.cu +++ 
b/src/device/nvidia/nvidia_resource.cu @@ -1,7 +1,40 @@ #include "nvidia_resource.cuh" +#include "cuda_utils.cuh" + +#include namespace llaisys::device::nvidia { -Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {} +Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id), _cublas_handle(nullptr) { + CUDA_CHECK(cudaSetDevice(device_id)); + cublasStatus_t status = cublasCreate(&_cublas_handle); + if (status != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("Failed to create cuBLAS handle"); + } +} + +Resource::~Resource() { + if (_cublas_handle != nullptr) { + cublasDestroy(_cublas_handle); + _cublas_handle = nullptr; + } +} + +cublasHandle_t Resource::cublasHandle() const { + return _cublas_handle; +} + +Resource &resource(int device_id) { + static std::mutex resource_mutex; + static std::unordered_map> resources; + + std::lock_guard lock(resource_mutex); + auto it = resources.find(device_id); + if (it == resources.end()) { + auto inserted = resources.emplace(device_id, std::make_unique(device_id)); + it = inserted.first; + } + return *it->second; +} } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_resource.cuh b/src/device/nvidia/nvidia_resource.cuh index a3002170b..085320c03 100644 --- a/src/device/nvidia/nvidia_resource.cuh +++ b/src/device/nvidia/nvidia_resource.cuh @@ -1,11 +1,23 @@ #pragma once #include "../device_resource.hpp" +#include + +#include +#include +#include namespace llaisys::device::nvidia { class Resource : public llaisys::device::DeviceResource { +private: + cublasHandle_t _cublas_handle; + public: Resource(int device_id); ~Resource(); + + cublasHandle_t cublasHandle() const; }; + +Resource &resource(int device_id); } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu index cab928261..2cdd7ae46 100644 --- 
a/src/device/nvidia/nvidia_runtime_api.cu +++ b/src/device/nvidia/nvidia_runtime_api.cu @@ -1,56 +1,98 @@ #include "../runtime_api.hpp" -#include #include +#include "cuda_utils.cuh" namespace llaisys::device::nvidia { namespace runtime_api { int getDeviceCount() { - TO_BE_IMPLEMENTED(); + int count = 0; + CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; } -void setDevice(int) { - TO_BE_IMPLEMENTED(); +void setDevice(int device_id) { + CUDA_CHECK(cudaSetDevice(device_id)); } void deviceSynchronize() { - TO_BE_IMPLEMENTED(); + CUDA_CHECK(cudaDeviceSynchronize()); } llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); + cudaStream_t stream = nullptr; + CUDA_CHECK(cudaStreamCreate(&stream)); + return reinterpret_cast(stream); } void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (stream == nullptr) { + return; + } + CUDA_CHECK(cudaStreamDestroy(toCudaStream(stream))); } + void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (stream == nullptr) { + CUDA_CHECK(cudaDeviceSynchronize()); + return; + } + CUDA_CHECK(cudaStreamSynchronize(toCudaStream(stream))); } void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); + if (size == 0) { + return nullptr; + } + void *ptr = nullptr; + CUDA_CHECK(cudaMalloc(&ptr, size)); + return ptr; } void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + CUDA_CHECK(cudaFree(ptr)); } void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); + if (size == 0) { + return nullptr; + } + void *ptr = nullptr; + CUDA_CHECK(cudaMallocHost(&ptr, size)); + return ptr; } void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + CUDA_CHECK(cudaFreeHost(ptr)); } void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); + if (size == 0) { + return; + } + if (kind == LLAISYS_MEMCPY_H2H) { + std::memcpy(dst, src, size); + return; + } + CUDA_CHECK(cudaMemcpy(dst, src, size, 
toCudaMemcpyKind(kind))); } -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + if (size == 0) { + return; + } + if (kind == LLAISYS_MEMCPY_H2H) { + std::memcpy(dst, src, size); + return; + } + CUDA_CHECK(cudaMemcpyAsync(dst, src, size, toCudaMemcpyKind(kind), toCudaStream(stream))); } static const LlaisysRuntimeAPI RUNTIME_API = { diff --git a/src/llaisys/models.cc b/src/llaisys/models.cc new file mode 100644 index 000000000..b19ca1723 --- /dev/null +++ b/src/llaisys/models.cc @@ -0,0 +1,128 @@ +#include "llaisys/models/qwen2.h" + +#include "llaisys_tensor.hpp" +#include "../models/qwen2/qwen2.hpp" + +__C { + +struct LlaisysQwen2Model { + llaisys::models::Qwen2Model* model; + LlaisysQwen2Weights weights; +}; + +__export struct LlaisysQwen2Model* llaisysQwen2ModelCreate( + const LlaisysQwen2Meta* meta, + llaisysDeviceType_t device, + int* device_ids, + int ndevice) { + + // Create config from meta + llaisys::models::Qwen2Config config; + config.dtype = meta->dtype; + config.num_layers = meta->nlayer; + config.hidden_size = meta->hs; + config.num_heads = meta->nh; + config.num_kv_heads = meta->nkvh; + config.head_dim = meta->dh; + config.intermediate_size = meta->di; + config.max_seq_len = meta->maxseq; + config.vocab_size = meta->voc; + config.rms_norm_eps = meta->epsilon; + config.rope_theta = meta->theta; + config.eos_token_id = meta->end_token; + + // For now, use first device + int device_id = (ndevice > 0 && device_ids != nullptr) ? 
device_ids[0] : 0; + + auto* wrapper = new LlaisysQwen2Model(); + wrapper->model = new llaisys::models::Qwen2Model(config, device, device_id); + + // Set up weight pointers + auto& model = *wrapper->model; + wrapper->weights.in_embed = new LlaisysTensor{model.embedTokens()}; + wrapper->weights.out_embed = new LlaisysTensor{model.lmHead()}; + wrapper->weights.out_norm_w = new LlaisysTensor{model.normWeight()}; + + size_t nlayer = config.num_layers; + wrapper->weights.attn_norm_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_q_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_q_b = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_k_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_k_b = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_v_w = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_v_b = new llaisysTensor_t[nlayer]; + wrapper->weights.attn_o_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_norm_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_gate_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_up_w = new llaisysTensor_t[nlayer]; + wrapper->weights.mlp_down_w = new llaisysTensor_t[nlayer]; + + for (size_t i = 0; i < nlayer; ++i) { + wrapper->weights.attn_norm_w[i] = new LlaisysTensor{model.inputLayernormWeight(i)}; + wrapper->weights.attn_q_w[i] = new LlaisysTensor{model.qProjWeight(i)}; + wrapper->weights.attn_q_b[i] = new LlaisysTensor{model.qProjBias(i)}; + wrapper->weights.attn_k_w[i] = new LlaisysTensor{model.kProjWeight(i)}; + wrapper->weights.attn_k_b[i] = new LlaisysTensor{model.kProjBias(i)}; + wrapper->weights.attn_v_w[i] = new LlaisysTensor{model.vProjWeight(i)}; + wrapper->weights.attn_v_b[i] = new LlaisysTensor{model.vProjBias(i)}; + wrapper->weights.attn_o_w[i] = new LlaisysTensor{model.oProjWeight(i)}; + wrapper->weights.mlp_norm_w[i] = new LlaisysTensor{model.postAttnLayernormWeight(i)}; + wrapper->weights.mlp_gate_w[i] = new LlaisysTensor{model.gateProjWeight(i)}; + 
wrapper->weights.mlp_up_w[i] = new LlaisysTensor{model.upProjWeight(i)}; + wrapper->weights.mlp_down_w[i] = new LlaisysTensor{model.downProjWeight(i)}; + } + + return wrapper; +} + +__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model* model) { + if (model == nullptr) return; + + size_t nlayer = model->model->config().num_layers; + + // Delete weight wrappers + delete model->weights.in_embed; + delete model->weights.out_embed; + delete model->weights.out_norm_w; + + for (size_t i = 0; i < nlayer; ++i) { + delete model->weights.attn_norm_w[i]; + delete model->weights.attn_q_w[i]; + delete model->weights.attn_q_b[i]; + delete model->weights.attn_k_w[i]; + delete model->weights.attn_k_b[i]; + delete model->weights.attn_v_w[i]; + delete model->weights.attn_v_b[i]; + delete model->weights.attn_o_w[i]; + delete model->weights.mlp_norm_w[i]; + delete model->weights.mlp_gate_w[i]; + delete model->weights.mlp_up_w[i]; + delete model->weights.mlp_down_w[i]; + } + + delete[] model->weights.attn_norm_w; + delete[] model->weights.attn_q_w; + delete[] model->weights.attn_q_b; + delete[] model->weights.attn_k_w; + delete[] model->weights.attn_k_b; + delete[] model->weights.attn_v_w; + delete[] model->weights.attn_v_b; + delete[] model->weights.attn_o_w; + delete[] model->weights.mlp_norm_w; + delete[] model->weights.mlp_gate_w; + delete[] model->weights.mlp_up_w; + delete[] model->weights.mlp_down_w; + + delete model->model; + delete model; +} + +__export struct LlaisysQwen2Weights* llaisysQwen2ModelWeights(struct LlaisysQwen2Model* model) { + return &model->weights; +} + +__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model* model, int64_t* token_ids, size_t ntoken) { + return model->model->infer(token_ids, ntoken); +} + +} diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32f..3a1a8a7c4 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -23,7 +23,7 @@ __C { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } void 
llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias) { - llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias == nullptr ? nullptr : bias->tensor); } void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in) { llaisys::ops::rearrange(out->tensor, in->tensor); diff --git a/src/llaisys/runtime.cc b/src/llaisys/runtime.cc index 7b00ff1bb..7ef917c0d 100644 --- a/src/llaisys/runtime.cc +++ b/src/llaisys/runtime.cc @@ -1,4 +1,5 @@ #include "llaisys/runtime.h" +#include "llaisys_tensor.hpp" #include "../core/context/context.hpp" #include "../device/runtime_api.hpp" @@ -10,4 +11,40 @@ __C void llaisysSetContextRuntime(llaisysDeviceType_t device_type, int device_id // Llaisys API for getting the runtime APIs __C const LlaisysRuntimeAPI *llaisysGetRuntimeAPI(llaisysDeviceType_t device_type) { return llaisys::device::getRuntimeAPI(device_type); +} + +__C void llaisysInitDistributed(int rank, int world_size) { + llaisys::core::context().initDistributed(rank, world_size); +} + +__C void llaisysFinalizeDistributed() { + llaisys::core::context().finalizeDistributed(); +} + +__C uint8_t llaisysDistributedIsInitialized() { + return static_cast(llaisys::core::context().distributedInitialized()); +} + +__C int llaisysDistributedRank() { + return llaisys::core::context().distributedRank(); +} + +__C int llaisysDistributedWorldSize() { + return llaisys::core::context().distributedWorldSize(); +} + +__C void llaisysDistAllReduce(llaisysTensor_t tensor) { + llaisys::core::context().allReduce(tensor->tensor); +} + +__C llaisysTensor_t llaisysDistAllGather(llaisysTensor_t tensor) { + return new LlaisysTensor{llaisys::core::context().allGather(tensor->tensor)}; +} + +__C void llaisysDistBroadcast(llaisysTensor_t tensor, int root) { + llaisys::core::context().broadcast(tensor->tensor, root); +} + +__C void llaisysDistBarrier() { + 
llaisys::core::context().barrier(); } \ No newline at end of file diff --git a/src/models/qwen2/qwen2.cpp b/src/models/qwen2/qwen2.cpp new file mode 100644 index 000000000..4cacea842 --- /dev/null +++ b/src/models/qwen2/qwen2.cpp @@ -0,0 +1,288 @@ +#include "qwen2.hpp" +#include "../../utils.hpp" + +namespace llaisys::models { +namespace { + +std::pair evenShardRange(size_t total, size_t rank, size_t world_size) { + ASSERT(world_size > 0, "Tensor parallel world size must be positive."); + ASSERT(total % world_size == 0, "Tensor parallel shard requires evenly divisible dimension."); + const size_t shard = total / world_size; + return {rank * shard, (rank + 1) * shard}; +} + +tensor_t sliceColumns(const tensor_t &tensor, size_t start, size_t end) { + ASSERT(tensor->ndim() == 2, "sliceColumns expects a 2D tensor."); + return tensor->slice(1, start, end); +} + +tensor_t makeContiguous(const tensor_t &tensor) { + if (tensor->isContiguous()) { + return tensor; + } + auto out = Tensor::create(tensor->shape(), tensor->dtype(), tensor->deviceType(), tensor->deviceId()); + ops::rearrange(out, tensor); + return out; +} + +float readScalarAsFloat(const std::byte *data, llaisysDataType_t dtype, size_t index) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return reinterpret_cast(data)[index]; + case LLAISYS_DTYPE_F16: + return llaisys::utils::cast(reinterpret_cast(data)[index]); + case LLAISYS_DTYPE_BF16: + return llaisys::utils::cast(reinterpret_cast(data)[index]); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} + +} // namespace + +Qwen2Model::Qwen2Model(const Qwen2Config& config, llaisysDeviceType_t device_type, int device_id) + : config_(config), + device_type_(device_type), + device_id_(device_id), + tp_rank_(0), + tp_world_size_(1), + hidden_shard_size_(0), + kv_hidden_size_(0), + kv_shard_size_(0), + intermediate_shard_size_(0), + vocab_shard_size_(0), + vocab_start_(0), + vocab_end_(0), + cache_len_(0), + buffer_seq_len_(0) { + if (device_type_ == 
LLAISYS_DEVICE_NVIDIA && core::context().distributedInitialized()) { + tp_rank_ = static_cast(core::context().distributedRank()); + tp_world_size_ = static_cast(core::context().distributedWorldSize()); + } + + const size_t nl = config_.num_layers; + const size_t hs = config_.hidden_size; + kv_hidden_size_ = config_.num_kv_heads * config_.head_dim; + ASSERT(hs % tp_world_size_ == 0, "Tensor parallel hidden size must be divisible by world size."); + ASSERT(kv_hidden_size_ % tp_world_size_ == 0, "Tensor parallel KV size must be divisible by world size."); + ASSERT(config_.intermediate_size % tp_world_size_ == 0, "Tensor parallel intermediate size must be divisible by world size."); + ASSERT(config_.vocab_size % tp_world_size_ == 0, "Tensor parallel vocab size must be divisible by world size."); + hidden_shard_size_ = hs / tp_world_size_; + kv_shard_size_ = kv_hidden_size_ / tp_world_size_; + intermediate_shard_size_ = config_.intermediate_size / tp_world_size_; + vocab_shard_size_ = config_.vocab_size / tp_world_size_; + auto vocab_range = evenShardRange(config_.vocab_size, tp_rank_, tp_world_size_); + vocab_start_ = vocab_range.first; + vocab_end_ = vocab_range.second; + + // Allocate embedding weights + embed_tokens_ = Tensor::create({vocab_shard_size_, hs}, config_.dtype, device_type_, device_id_); + lm_head_ = Tensor::create({vocab_shard_size_, hs}, config_.dtype, device_type_, device_id_); + norm_weight_ = Tensor::create({hs}, config_.dtype, device_type_, device_id_); + + // Allocate per-layer weights + input_layernorm_weight_.resize(nl); + q_proj_weight_.resize(nl); + q_proj_bias_.resize(nl); + k_proj_weight_.resize(nl); + k_proj_bias_.resize(nl); + v_proj_weight_.resize(nl); + v_proj_bias_.resize(nl); + o_proj_weight_.resize(nl); + post_attn_layernorm_weight_.resize(nl); + gate_proj_weight_.resize(nl); + up_proj_weight_.resize(nl); + down_proj_weight_.resize(nl); + + for (size_t i = 0; i < nl; ++i) { + input_layernorm_weight_[i] = Tensor::create({hs}, 
config_.dtype, device_type_, device_id_); + q_proj_weight_[i] = Tensor::create({hidden_shard_size_, hs}, config_.dtype, device_type_, device_id_); + q_proj_bias_[i] = Tensor::create({hidden_shard_size_}, config_.dtype, device_type_, device_id_); + k_proj_weight_[i] = Tensor::create({kv_shard_size_, hs}, config_.dtype, device_type_, device_id_); + k_proj_bias_[i] = Tensor::create({kv_shard_size_}, config_.dtype, device_type_, device_id_); + v_proj_weight_[i] = Tensor::create({kv_shard_size_, hs}, config_.dtype, device_type_, device_id_); + v_proj_bias_[i] = Tensor::create({kv_shard_size_}, config_.dtype, device_type_, device_id_); + o_proj_weight_[i] = Tensor::create({hs, hidden_shard_size_}, config_.dtype, device_type_, device_id_); + post_attn_layernorm_weight_[i] = Tensor::create({hs}, config_.dtype, device_type_, device_id_); + gate_proj_weight_[i] = Tensor::create({intermediate_shard_size_, hs}, config_.dtype, device_type_, device_id_); + up_proj_weight_[i] = Tensor::create({intermediate_shard_size_, hs}, config_.dtype, device_type_, device_id_); + down_proj_weight_[i] = Tensor::create({hs, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + } + + // Allocate KV cache + allocateKVCache(); +} + +void Qwen2Model::allocateKVCache() { + const size_t nl = config_.num_layers; + const size_t maxseq = config_.max_seq_len; + + k_cache_.resize(nl); + v_cache_.resize(nl); + + for (size_t i = 0; i < nl; ++i) { + k_cache_[i] = Tensor::create({maxseq, kv_shard_size_}, config_.dtype, device_type_, device_id_); + v_cache_[i] = Tensor::create({maxseq, kv_shard_size_}, config_.dtype, device_type_, device_id_); + } +} + +void Qwen2Model::allocateBuffers(size_t seq_len) { + if (buffer_seq_len_ == seq_len && hidden_states_ != nullptr) { + return; + } + + const size_t hs = config_.hidden_size; + + hidden_states_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, device_id_); + normed_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, 
device_id_); + q_local_ = Tensor::create({seq_len, hidden_shard_size_}, config_.dtype, device_type_, device_id_); + k_local_ = Tensor::create({seq_len, kv_shard_size_}, config_.dtype, device_type_, device_id_); + v_local_ = Tensor::create({seq_len, kv_shard_size_}, config_.dtype, device_type_, device_id_); + attn_proj_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, device_id_); + gate_ = Tensor::create({seq_len, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + up_ = Tensor::create({seq_len, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + mlp_out_ = Tensor::create({seq_len, intermediate_shard_size_}, config_.dtype, device_type_, device_id_); + down_ = Tensor::create({seq_len, hs}, config_.dtype, device_type_, device_id_); + logits_local_ = Tensor::create({1, vocab_shard_size_}, config_.dtype, device_type_, device_id_); + pos_ids_ = Tensor::create({seq_len}, LLAISYS_DTYPE_I64, device_type_, device_id_); + input_ids_ = Tensor::create({seq_len}, LLAISYS_DTYPE_I64, device_type_, device_id_); + max_idx_ = Tensor::create({1}, LLAISYS_DTYPE_I64, device_type_, device_id_); + max_val_ = Tensor::create({1}, config_.dtype, device_type_, device_id_); + buffer_seq_len_ = seq_len; +} + +int64_t Qwen2Model::infer(const int64_t* token_ids, size_t num_tokens) { + const size_t nh = config_.num_heads; + const size_t dh = config_.head_dim; + const size_t nl = config_.num_layers; + const float eps = config_.rms_norm_eps; + const float theta = config_.rope_theta; + const float scale = 1.0f / std::sqrt(static_cast(dh)); + + // Allocate buffers for this sequence length + allocateBuffers(num_tokens); + + // Create input token tensor + input_ids_->load(token_ids); + + // Create position ids: [cache_len, cache_len+1, ..., cache_len+num_tokens-1] + std::vector pos_ids_data(num_tokens); + for (size_t i = 0; i < num_tokens; ++i) { + pos_ids_data[i] = static_cast(cache_len_ + i); + } + pos_ids_->load(pos_ids_data.data()); + + // Embedding 
lookup: hidden_states = embed_tokens[input_ids] + ops::parallelEmbedding(hidden_states_, input_ids_, embed_tokens_, vocab_start_, vocab_end_); + if (tp_world_size_ > 1) { + core::context().allReduce(hidden_states_); + } + + // Process each layer + for (size_t layer = 0; layer < nl; ++layer) { + // Input layernorm + ops::rms_norm(normed_, hidden_states_, input_layernorm_weight_[layer], eps); + + ops::columnParallelLinear(q_local_, normed_, q_proj_weight_[layer], q_proj_bias_[layer]); + ops::columnParallelLinear(k_local_, normed_, k_proj_weight_[layer], k_proj_bias_[layer]); + ops::columnParallelLinear(v_local_, normed_, v_proj_weight_[layer], v_proj_bias_[layer]); + + auto q_full_flat = ops::gatherLastDim(q_local_); + auto k_full_flat = ops::gatherLastDim(k_local_); + auto q_full = q_full_flat->view({num_tokens, nh, dh}); + auto k_full = k_full_flat->view({num_tokens, config_.num_kv_heads, dh}); + + ops::rope(q_full, q_full, pos_ids_, theta); + ops::rope(k_full, k_full, pos_ids_, theta); + + // Update KV cache + auto k_cache_slice = k_cache_[layer]->slice(0, cache_len_, cache_len_ + num_tokens); + auto v_cache_slice = v_cache_[layer]->slice(0, cache_len_, cache_len_ + num_tokens); + auto k_after_rope_flat = k_full->view({num_tokens, kv_hidden_size_}); + auto k_local_after_rope = makeContiguous( + sliceColumns(k_after_rope_flat, tp_rank_ * kv_shard_size_, (tp_rank_ + 1) * kv_shard_size_)); + ops::rearrange(k_cache_slice, k_local_after_rope); + ops::rearrange(v_cache_slice, v_local_); + + auto k_full_cache = ops::gatherLastDim(k_cache_[layer]->slice(0, 0, cache_len_ + num_tokens)); + auto v_full_cache = ops::gatherLastDim(v_cache_[layer]->slice(0, 0, cache_len_ + num_tokens)); + auto k_attn = k_full_cache->view({cache_len_ + num_tokens, config_.num_kv_heads, dh}); + auto v_attn = v_full_cache->view({cache_len_ + num_tokens, config_.num_kv_heads, dh}); + + // Self attention + auto attn_out = Tensor::create({num_tokens, nh, dh}, config_.dtype, device_type_, device_id_); 
+ ops::self_attention(attn_out, q_full, k_attn, v_attn, scale); + + // Output projection + auto attn_out_flat = attn_out->view({num_tokens, nh * dh}); + auto attn_out_local = makeContiguous( + sliceColumns(attn_out_flat, tp_rank_ * hidden_shard_size_, (tp_rank_ + 1) * hidden_shard_size_)); + ops::rowParallelLinear(attn_proj_, attn_out_local, o_proj_weight_[layer]); + + // Residual connection: hidden_states = hidden_states + attn_proj + ops::add(hidden_states_, hidden_states_, attn_proj_); + + // Post attention layernorm + ops::rms_norm(normed_, hidden_states_, post_attn_layernorm_weight_[layer], eps); + + // MLP: gate_proj, up_proj, swiglu, down_proj + ops::columnParallelLinear(gate_, normed_, gate_proj_weight_[layer], nullptr); + ops::columnParallelLinear(up_, normed_, up_proj_weight_[layer], nullptr); + ops::swiglu(mlp_out_, gate_, up_); + ops::rowParallelLinear(down_, mlp_out_, down_proj_weight_[layer]); + + // Residual connection: hidden_states = hidden_states + down + ops::add(hidden_states_, hidden_states_, down_); + } + + // Final layer norm + ops::rms_norm(normed_, hidden_states_, norm_weight_, eps); + + // Get last token's hidden state for prediction + auto last_hidden = normed_->slice(0, num_tokens - 1, num_tokens); // [1, hs] + + auto& ctx = core::context(); + ctx.setDevice(device_type_, device_id_); + auto& runtime = ctx.runtime(); + + // LM head local shard + ops::columnParallelLinear(logits_local_, last_hidden, lm_head_, nullptr); + auto logits_local_1d = logits_local_->view({vocab_shard_size_}); + ops::argmax(max_idx_, max_val_, logits_local_1d); + + int64_t local_idx = 0; + runtime.api()->memcpy_sync(&local_idx, max_idx_->data(), sizeof(int64_t), LLAISYS_MEMCPY_D2H); + local_idx += static_cast(vocab_start_); + + int64_t next_token = local_idx; + if (tp_world_size_ > 1) { + auto local_global_idx = Tensor::create({1}, LLAISYS_DTYPE_I64, device_type_, device_id_); + local_global_idx->load(&local_idx); + auto gathered_values = ctx.allGather(max_val_); + 
auto gathered_indices = ctx.allGather(local_global_idx); + + std::vector gathered_value_bytes(gathered_values->numel() * gathered_values->elementSize()); + std::vector gathered_index_host(gathered_indices->numel()); + runtime.api()->memcpy_sync(gathered_value_bytes.data(), gathered_values->data(), gathered_value_bytes.size(), LLAISYS_MEMCPY_D2H); + runtime.api()->memcpy_sync(gathered_index_host.data(), gathered_indices->data(), + gathered_index_host.size() * sizeof(int64_t), LLAISYS_MEMCPY_D2H); + + float best_value = readScalarAsFloat(gathered_value_bytes.data(), config_.dtype, 0); + next_token = gathered_index_host[0]; + for (size_t rank = 1; rank < tp_world_size_; ++rank) { + const float value = readScalarAsFloat(gathered_value_bytes.data(), config_.dtype, rank); + const int64_t index = gathered_index_host[rank]; + if (value > best_value || (value == best_value && index < next_token)) { + best_value = value; + next_token = index; + } + } + } + + // Update cache length + cache_len_ += num_tokens; + + return next_token; +} + +} // namespace llaisys::models diff --git a/src/models/qwen2/qwen2.hpp b/src/models/qwen2/qwen2.hpp new file mode 100644 index 000000000..a27ca690d --- /dev/null +++ b/src/models/qwen2/qwen2.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "../../tensor/tensor.hpp" +#include "../../ops/add/op.hpp" +#include "../../ops/argmax/op.hpp" +#include "../../ops/embedding/op.hpp" +#include "../../ops/linear/op.hpp" +#include "../../ops/rearrange/op.hpp" +#include "../../ops/rms_norm/op.hpp" +#include "../../ops/rope/op.hpp" +#include "../../ops/self_attention/op.hpp" +#include "../../ops/swiglu/op.hpp" + +#include +#include + +namespace llaisys::models { + +struct Qwen2Config { + llaisysDataType_t dtype; + size_t num_layers; // nlayer: 28 + size_t hidden_size; // hs: 1536 + size_t num_heads; // nh: 12 + size_t num_kv_heads; // nkvh: 2 + size_t head_dim; // dh: 128 (hidden_size / num_heads) + size_t intermediate_size; // di: 8960 + size_t max_seq_len; 
// maxseq: 131072 + size_t vocab_size; // voc: 151936 + float rms_norm_eps; // epsilon: 1e-6 + float rope_theta; // theta: 10000 + int64_t eos_token_id; // end_token: 151643 +}; + +class Qwen2Model { +private: + Qwen2Config config_; + llaisysDeviceType_t device_type_; + int device_id_; + size_t tp_rank_; + size_t tp_world_size_; + size_t hidden_shard_size_; + size_t kv_hidden_size_; + size_t kv_shard_size_; + size_t intermediate_shard_size_; + size_t vocab_shard_size_; + size_t vocab_start_; + size_t vocab_end_; + + // Weights + tensor_t embed_tokens_; // [vocab_shard_size, hidden_size] + tensor_t lm_head_; // [vocab_shard_size, hidden_size] + tensor_t norm_weight_; // [hidden_size] + + // Per-layer weights + std::vector input_layernorm_weight_; // [hidden_size] + std::vector q_proj_weight_; // [hidden_shard_size, hidden_size] + std::vector q_proj_bias_; // [hidden_shard_size] + std::vector k_proj_weight_; // [kv_shard_size, hidden_size] + std::vector k_proj_bias_; // [kv_shard_size] + std::vector v_proj_weight_; // [kv_shard_size, hidden_size] + std::vector v_proj_bias_; // [kv_shard_size] + std::vector o_proj_weight_; // [hidden_size, hidden_shard_size] + std::vector post_attn_layernorm_weight_; // [hidden_size] + std::vector gate_proj_weight_; // [intermediate_shard_size, hidden_size] + std::vector up_proj_weight_; // [intermediate_shard_size, hidden_size] + std::vector down_proj_weight_; // [hidden_size, intermediate_shard_size] + + // KV Cache: local shard per rank [num_layers][max_seq_len, kv_shard_size] + std::vector k_cache_; + std::vector v_cache_; + size_t cache_len_; // Current cached sequence length + + // Intermediate buffers + tensor_t hidden_states_; // [seq_len, hidden_size] + tensor_t normed_; // [seq_len, hidden_size] + tensor_t q_local_; // [seq_len, hidden_shard_size] + tensor_t k_local_; // [seq_len, kv_shard_size] + tensor_t v_local_; // [seq_len, kv_shard_size] + tensor_t attn_proj_; // [seq_len, hidden_size] + tensor_t gate_; // [seq_len, 
intermediate_shard_size] + tensor_t up_; // [seq_len, intermediate_shard_size] + tensor_t mlp_out_; // [seq_len, intermediate_shard_size] + tensor_t down_; // [seq_len, hidden_size] + tensor_t logits_local_; // [1, vocab_shard_size] + tensor_t pos_ids_; // [seq_len] + tensor_t input_ids_; // [seq_len] + tensor_t max_idx_; // [1] + tensor_t max_val_; // [1] + size_t buffer_seq_len_; + + void allocateBuffers(size_t seq_len); + void allocateKVCache(); + +public: + Qwen2Model(const Qwen2Config& config, llaisysDeviceType_t device_type, int device_id); + ~Qwen2Model() = default; + + // Weight accessors for loading + tensor_t& embedTokens() { return embed_tokens_; } + tensor_t& lmHead() { return lm_head_; } + tensor_t& normWeight() { return norm_weight_; } + tensor_t& inputLayernormWeight(size_t layer) { return input_layernorm_weight_[layer]; } + tensor_t& qProjWeight(size_t layer) { return q_proj_weight_[layer]; } + tensor_t& qProjBias(size_t layer) { return q_proj_bias_[layer]; } + tensor_t& kProjWeight(size_t layer) { return k_proj_weight_[layer]; } + tensor_t& kProjBias(size_t layer) { return k_proj_bias_[layer]; } + tensor_t& vProjWeight(size_t layer) { return v_proj_weight_[layer]; } + tensor_t& vProjBias(size_t layer) { return v_proj_bias_[layer]; } + tensor_t& oProjWeight(size_t layer) { return o_proj_weight_[layer]; } + tensor_t& postAttnLayernormWeight(size_t layer) { return post_attn_layernorm_weight_[layer]; } + tensor_t& gateProjWeight(size_t layer) { return gate_proj_weight_[layer]; } + tensor_t& upProjWeight(size_t layer) { return up_proj_weight_[layer]; } + tensor_t& downProjWeight(size_t layer) { return down_proj_weight_[layer]; } + + // Inference + int64_t infer(const int64_t* token_ids, size_t num_tokens); + + // Reset KV cache + void resetCache() { cache_len_ = 0; } + + const Qwen2Config& config() const { return config_; } +}; + +} // namespace llaisys::models diff --git a/src/ops/add/nvidia/add_nvidia.cu b/src/ops/add/nvidia/add_nvidia.cu new file 
mode 100644 index 000000000..6ebaea621 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cu @@ -0,0 +1,61 @@ +#include "add_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void addKernel(T *c, const T *a, const T *b, size_t numel) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += stride) { + const float value = llaisys::device::nvidia::scalarToFloat(a[idx]) + + llaisys::device::nvidia::scalarToFloat(b[idx]); + c[idx] = llaisys::device::nvidia::floatToScalar(value); + } +} + +} // namespace + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel, llaisysStream_t stream) { + const int max_blocks = 4096; + const int threads = 256; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + if (blocks == 0) { + return; + } + + switch (type) { + case LLAISYS_DTYPE_F32: + addKernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_F16: + addKernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_BF16: + addKernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/add/nvidia/add_nvidia.cuh b/src/ops/add/nvidia/add_nvidia.cuh new file mode 100644 index 000000000..2ed37ba55 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cuh @@ -0,0 +1,18 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void add(std::byte *c, const std::byte *a, const 
std::byte *b, llaisysDataType_t type, size_t numel, llaisysStream_t stream); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d7..39dc69698 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,9 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/add_nvidia.cuh" +#endif namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,8 +28,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::add(c->data(), a->data(), b->data(), c->dtype(), c->numel(), llaisys::core::context().runtime().stream()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 000000000..942273f74 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,45 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include + +template +void argmax_(N *max_idx, T *max_val, const T *vals, size_t numel) { + max_idx[0] = 0; + max_val[0] = vals[0]; + if constexpr (std::is_same_v || std::is_same_v) { + for (size_t i = 1; i < numel; i++) { + if (llaisys::utils::cast(vals[i]) > llaisys::utils::cast(max_val[0])) { + max_val[0] = vals[i]; + max_idx[0] = static_cast(i); + } + } + } else { + for (size_t i = 1; i < numel; i++) { + if (vals[i] > max_val[0]) { + max_val[0] = vals[i]; + max_idx[0] = static_cast(i); + } + } + } +} + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + switch (type) { + case 
LLAISYS_DTYPE_F32: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_BF16: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_F16: + return argmax_(reinterpret_cast(max_idx), reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 000000000..891a6d20a --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size); +} diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cu b/src/ops/argmax/nvidia/argmax_nvidia.cu new file mode 100644 index 000000000..7e5e5172d --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cu @@ -0,0 +1,89 @@ +#include "argmax_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void argmaxKernel(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + __shared__ float shared_vals[256]; + __shared__ int64_t shared_idx[256]; + + const int tid = threadIdx.x; + float thread_max = -std::numeric_limits::infinity(); + int64_t thread_idx = 0; + + for (size_t idx = static_cast(tid); idx < numel; idx += blockDim.x) { + const float value = llaisys::device::nvidia::scalarToFloat(vals[idx]); + if (value > thread_max) { + thread_max = value; + thread_idx = static_cast(idx); + } + } + + shared_vals[tid] = thread_max; + shared_idx[tid] = thread_idx; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + const float other_val = shared_vals[tid + offset]; + 
const int64_t other_idx = shared_idx[tid + offset]; + if (other_val > shared_vals[tid]) { + shared_vals[tid] = other_val; + shared_idx[tid] = other_idx; + } + } + __syncthreads(); + } + + if (tid == 0) { + max_idx[0] = shared_idx[0]; + max_val[0] = llaisys::device::nvidia::floatToScalar(shared_vals[0]); + } +} + +} // namespace + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel, llaisysStream_t stream) { + if (numel == 0) { + return; + } + + constexpr int threads = 256; + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + auto *max_idx_ptr = reinterpret_cast(max_idx); + + switch (type) { + case LLAISYS_DTYPE_F32: + argmaxKernel<<<1, threads, 0, cuda_stream>>>( + max_idx_ptr, + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_F16: + argmaxKernel<<<1, threads, 0, cuda_stream>>>( + max_idx_ptr, + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_BF16: + argmaxKernel<<<1, threads, 0, cuda_stream>>>( + max_idx_ptr, + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cuh b/src/ops/argmax/nvidia/argmax_nvidia.cuh new file mode 100644 index 000000000..d90c10555 --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel, llaisysStream_t stream); +} diff --git a/src/ops/argmax/op.cpp 
b/src/ops/argmax/op.cpp index 6dc37d426..89c1d46fb 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,37 @@ #include "op.hpp" +#include "../../utils.hpp" +#include "llaisys.h" + +#include "cpu/argmax_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/argmax_nvidia.cuh" +#endif + namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + CHECK_SAME_DTYPE(max_val->dtype(), vals->dtype()); + ASSERT(max_idx->dtype() == llaisysDataType_t::LLAISYS_DTYPE_I64, "Argmax: the data type of max_idx must be LLAISYS_DTYPE_I64."); + ASSERT(max_idx->isContiguous() && max_val->isContiguous() && vals->isContiguous(), "Argmax: all tensors must be contiguous."); + + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel(), + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 000000000..531eb279e --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,32 @@ +#include "embedding_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include + +template +void embedding_(T *out, const I *index, const T *weight, size_t num_indices, size_t hidden_size) { + for (size_t i = 0; i < num_indices; i++) { + memcpy(out + i * 
hidden_size, weight + index[i] * hidden_size, hidden_size * sizeof(T)); + } +} + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, size_t hidden_size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), reinterpret_cast(weight), num_indices, hidden_size); + case LLAISYS_DTYPE_BF16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num_indices, hidden_size); + case LLAISYS_DTYPE_F16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num_indices, hidden_size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 000000000..160bac617 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, size_t hidden_size); +} \ No newline at end of file diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cu b/src/ops/embedding/nvidia/embedding_nvidia.cu new file mode 100644 index 000000000..4e91d6772 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cu @@ -0,0 +1,140 @@ +#include "embedding_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void embeddingKernel(T *out, const int64_t *index, const T *weight, size_t num_indices, size_t hidden_size) { + const size_t total = num_indices * hidden_size; + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + 
threadIdx.x; linear_idx < total; linear_idx += stride) { + const size_t row = linear_idx / hidden_size; + const size_t col = linear_idx % hidden_size; + const int64_t vocab_row = index[row]; + out[linear_idx] = weight[static_cast(vocab_row) * hidden_size + col]; + } +} + +template +__global__ void parallelEmbeddingKernel(T *out, const int64_t *index, const T *weight, size_t num_indices, size_t hidden_size, + size_t vocab_start, size_t vocab_end) { + const size_t total = num_indices * hidden_size; + const size_t stride = static_cast(blockDim.x) * gridDim.x; + const T zero = llaisys::device::nvidia::floatToScalar(0.0f); + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; linear_idx < total; linear_idx += stride) { + const size_t row = linear_idx / hidden_size; + const size_t col = linear_idx % hidden_size; + const int64_t vocab_row = index[row]; + if (vocab_row >= static_cast(vocab_start) && vocab_row < static_cast(vocab_end)) { + const size_t local_row = static_cast(vocab_row) - vocab_start; + out[linear_idx] = weight[local_row * hidden_size + col]; + } else { + out[linear_idx] = zero; + } + } +} + +} // namespace + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, llaisysStream_t stream) { + const size_t total = num_indices * hidden_size; + if (total == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((total + threads - 1) / threads, static_cast(max_blocks))); + const auto *index_ptr = reinterpret_cast(index); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + embeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size); + break; + case LLAISYS_DTYPE_F16: + embeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + 
reinterpret_cast(weight), + num_indices, + hidden_size); + break; + case LLAISYS_DTYPE_BF16: + embeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +void parallelEmbedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, size_t vocab_start, size_t vocab_end, llaisysStream_t stream) { + const size_t total = num_indices * hidden_size; + if (total == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((total + threads - 1) / threads, static_cast(max_blocks))); + const auto *index_ptr = reinterpret_cast(index); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + parallelEmbeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size, + vocab_start, + vocab_end); + break; + case LLAISYS_DTYPE_F16: + parallelEmbeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size, + vocab_start, + vocab_end); + break; + case LLAISYS_DTYPE_BF16: + parallelEmbeddingKernel<<>>( + reinterpret_cast(out), + index_ptr, + reinterpret_cast(weight), + num_indices, + hidden_size, + vocab_start, + vocab_end); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cuh b/src/ops/embedding/nvidia/embedding_nvidia.cuh new file mode 100644 index 000000000..a7d260c1a --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cuh @@ -0,0 +1,12 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void embedding(std::byte 
*out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, llaisysStream_t stream); +void parallelEmbedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, size_t vocab_start, size_t vocab_end, llaisysStream_t stream); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d06..46cf57430 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,107 @@ #include "op.hpp" +#include "cpu/embedding_cpu.hpp" +#include "llaisys.h" +#include "../../utils.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/embedding_nvidia.cuh" +#endif + namespace llaisys::ops { + +namespace { + +template +void parallelEmbeddingCpuImpl(std::byte *out, const std::byte *index, const std::byte *weight, size_t num_indices, size_t hidden_size, + size_t vocab_start, size_t vocab_end) { + auto *out_ptr = reinterpret_cast(out); + const auto *index_ptr = reinterpret_cast(index); + const auto *weight_ptr = reinterpret_cast(weight); + for (size_t row = 0; row < num_indices; ++row) { + const int64_t vocab_row = index_ptr[row]; + for (size_t col = 0; col < hidden_size; ++col) { + if (vocab_row >= static_cast(vocab_start) && vocab_row < static_cast(vocab_end)) { + const size_t local_row = static_cast(vocab_row) - vocab_start; + out_ptr[row * hidden_size + col] = weight_ptr[local_row * hidden_size + col]; + } else { + out_ptr[row * hidden_size + col] = llaisys::utils::cast(0.0f); + } + } + } +} + +void parallelEmbeddingCpu(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num_indices, + size_t hidden_size, size_t vocab_start, size_t vocab_end) { + switch (type) { + case LLAISYS_DTYPE_F32: + return parallelEmbeddingCpuImpl(out, index, weight, num_indices, hidden_size, vocab_start, vocab_end); + case LLAISYS_DTYPE_F16: + return parallelEmbeddingCpuImpl(out, index, weight, 
num_indices, hidden_size, vocab_start, vocab_end); + case LLAISYS_DTYPE_BF16: + return parallelEmbeddingCpuImpl(out, index, weight, num_indices, hidden_size, vocab_start, vocab_end); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace + void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + ASSERT(out->shape()[1] == weight->shape()[1], ""); + ASSERT(out->shape()[0] == index->shape()[0], ""); + ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), ""); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->shape()[0], weight->shape()[1]); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->shape()[0], weight->shape()[1]); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->shape()[0], weight->shape()[1], + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} + +void parallelEmbedding(tensor_t out, tensor_t index, tensor_t weight_local, size_t vocab_start, size_t vocab_end) { + CHECK_SAME_DEVICE(out, index, weight_local); + CHECK_SAME_DTYPE(out->dtype(), weight_local->dtype()); + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "ParallelEmbedding: index must be int64."); + ASSERT(out->ndim() == 2, "ParallelEmbedding: out must be [num_tokens, hidden_size]."); + ASSERT(index->ndim() == 1, "ParallelEmbedding: index must be [num_tokens]."); + ASSERT(weight_local->ndim() == 2, "ParallelEmbedding: weight must be [local_vocab, hidden_size]."); + ASSERT(out->shape()[0] == 
index->shape()[0], "ParallelEmbedding: num_tokens mismatch."); + ASSERT(out->shape()[1] == weight_local->shape()[1], "ParallelEmbedding: hidden size mismatch."); + ASSERT(out->isContiguous() && index->isContiguous() && weight_local->isContiguous(), + "ParallelEmbedding: all tensors must be contiguous."); + ASSERT(vocab_start <= vocab_end, "ParallelEmbedding: invalid vocab shard range."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return parallelEmbeddingCpu(out->data(), index->data(), weight_local->data(), out->dtype(), out->shape()[0], out->shape()[1], vocab_start, vocab_end); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return parallelEmbeddingCpu(out->data(), index->data(), weight_local->data(), out->dtype(), out->shape()[0], out->shape()[1], vocab_start, vocab_end); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::parallelEmbedding(out->data(), index->data(), weight_local->data(), out->dtype(), out->shape()[0], out->shape()[1], + vocab_start, vocab_end, llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/op.hpp b/src/ops/embedding/op.hpp index 37216c0cf..d2acd3023 100644 --- a/src/ops/embedding/op.hpp +++ b/src/ops/embedding/op.hpp @@ -4,4 +4,5 @@ namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight); +void parallelEmbedding(tensor_t out, tensor_t index, tensor_t weight_local, size_t vocab_start, size_t vocab_end); } diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 000000000..5174d180e --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,37 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void linear_(T *out, const T *in, const T *weight, const T *bias, std::vector out_shape, 
std::vector in_shape, std::vector weight_shape, std::vector bias_shape) { + for (size_t i = 0; i < out_shape[0]; i++) { + for (size_t j = 0; j < out_shape[1]; j++) { + float val = 0.0F; // 使用 float 累加,避免 f16/bf16 精度损失 + for (size_t k = 0; k < in_shape[1]; k++) { + val += llaisys::utils::cast(in[i * in_shape[1] + k]) + * llaisys::utils::cast(weight[j * weight_shape[1] + k]); + } + if (bias) { + val += llaisys::utils::cast(bias[j]); + } + out[i * out_shape[1] + j] = llaisys::utils::cast(val); + } + } +} + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, std::vector out_shape, std::vector in_shape, std::vector weight_shape, std::vector bias_shape) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), out_shape, in_shape, weight_shape, bias_shape); + case LLAISYS_DTYPE_BF16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), out_shape, in_shape, weight_shape, bias_shape); + case LLAISYS_DTYPE_F16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), reinterpret_cast(weight), reinterpret_cast(bias), out_shape, in_shape, weight_shape, bias_shape); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 000000000..107dccc81 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, std::vector out_shape, std::vector in_shape, std::vector weight_shape, std::vector bias_shape); +} \ No newline at end of file diff --git 
a/src/ops/linear/nvidia/linear_nvidia.cu b/src/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..c707e4ecd --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,114 @@ +#include "linear_nvidia.cuh" + +#include "../../../core/llaisys_core.hpp" +#include "../../../device/nvidia/cuda_utils.cuh" +#include "../../../device/nvidia/nvidia_resource.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +#define CUBLAS_CHECK(EXPR__) \ + do { \ + cublasStatus_t status__ = (EXPR__); \ + if (status__ != CUBLAS_STATUS_SUCCESS) { \ + throw std::runtime_error("cuBLAS call failed"); \ + } \ + } while (0) + +inline cudaDataType_t toCudaDataType(llaisysDataType_t type) { + switch (type) { + case LLAISYS_DTYPE_F32: + return CUDA_R_32F; + case LLAISYS_DTYPE_F16: + return CUDA_R_16F; + case LLAISYS_DTYPE_BF16: + return CUDA_R_16BF; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +template +__global__ void addBiasKernel(T *out, const T *bias, size_t numel, size_t cols) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += stride) { + const size_t col = idx % cols; + const float value = llaisys::device::nvidia::scalarToFloat(out[idx]) + + llaisys::device::nvidia::scalarToFloat(bias[col]); + out[idx] = llaisys::device::nvidia::floatToScalar(value); + } +} + +template +void launchBias(std::byte *out, const std::byte *bias, size_t numel, size_t cols, cudaStream_t stream) { + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + addBiasKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + numel, + cols); + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace + +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t m, size_t n, 
size_t k, llaisysStream_t stream) { + if (m == 0 || n == 0 || k == 0) { + return; + } + + auto &runtime = llaisys::core::context().runtime(); + cublasHandle_t handle = llaisys::device::nvidia::resource(runtime.deviceId()).cublasHandle(); + CUBLAS_CHECK(cublasSetStream(handle, llaisys::device::nvidia::toCudaStream(stream))); + + const float alpha = 1.0f; + const float beta = 0.0f; + const cudaDataType_t data_type = toCudaDataType(type); + + CUBLAS_CHECK(cublasGemmEx( + handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + static_cast(n), + static_cast(m), + static_cast(k), + &alpha, + weight, + data_type, + static_cast(k), + in, + data_type, + static_cast(k), + &beta, + out, + data_type, + static_cast(n), + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT)); + + if (bias != nullptr) { + const size_t numel = m * n; + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + switch (type) { + case LLAISYS_DTYPE_F32: + launchBias(out, bias, numel, n, cuda_stream); + break; + case LLAISYS_DTYPE_F16: + launchBias(out, bias, numel, n, cuda_stream); + break; + case LLAISYS_DTYPE_BF16: + launchBias(out, bias, numel, n, cuda_stream); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/linear/nvidia/linear_nvidia.cuh b/src/ops/linear/nvidia/linear_nvidia.cuh new file mode 100644 index 000000000..8da88d895 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t m, size_t n, size_t k, llaisysStream_t stream); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f8655..3ef71e4c8 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,94 @@ #include "op.hpp" +#include "cpu/linear_cpu.hpp" +#include "../rearrange/op.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/linear_nvidia.cuh" +#endif + namespace llaisys::ops { + +tensor_t gatherLastDim(const tensor_t &local) { + ASSERT(local != nullptr, "gatherLastDim input must not be null."); + ASSERT(local->isContiguous(), "gatherLastDim requires a contiguous tensor."); + + auto &ctx = llaisys::core::context(); + if (!ctx.distributedInitialized() || ctx.distributedWorldSize() == 1) { + return local; + } + + auto gathered = ctx.allGather(local); + const size_t local_ndim = local->ndim(); + ASSERT(local_ndim >= 1, "gatherLastDim requires tensor rank >= 1."); + + std::vector order; + order.reserve(local_ndim + 1); + for (size_t dim = 1; dim < local_ndim; ++dim) { + order.push_back(dim); + } + order.push_back(0); + order.push_back(local_ndim); + + auto permuted = gathered->permute(order); + auto merged = llaisys::Tensor::create(permuted->shape(), permuted->dtype(), permuted->deviceType(), permuted->deviceId()); + ops::rearrange(merged, permuted); + + std::vector final_shape = local->shape(); + final_shape.back() *= static_cast(ctx.distributedWorldSize()); + return merged->view(final_shape); +} + +void columnParallelLinear(tensor_t out_local, tensor_t in, tensor_t weight_local, tensor_t bias_local) { + linear(out_local, in, weight_local, bias_local); +} + +void rowParallelLinear(tensor_t out, tensor_t in_local, tensor_t weight_local) { + auto &ctx = llaisys::core::context(); + linear(out, in_local, weight_local, nullptr); + if (ctx.distributedInitialized() && ctx.distributedWorldSize() > 1) { + ctx.allReduce(out); + } +} + void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + // 
TO_BE_IMPLEMENTED(); + if (bias == nullptr) { + CHECK_SAME_DEVICE(out, in, weight); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), "Linear: all tensors must be contiguous."); + } else { + CHECK_SAME_DEVICE(out, in, weight, bias); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype(), bias->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous() && bias->isContiguous(), "Linear: all tensors must be contiguous."); + ASSERT(bias->shape()[0] == weight->shape()[0], "Linear: bias.shape[0] must be equal to weight.shape[0]."); + } + ASSERT(out->shape()[0] == in->shape()[0], "Linear: out.shape[0] must be equal to in.shape[0]."); + ASSERT(out->shape()[1] == weight->shape()[0], "Linear: out.shape[1] must be equal to weight.shape[0]."); + ASSERT(in->shape()[1] == weight->shape()[1], "Linear: in.shape[1] must be equal to weight.shape[1]."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + if (bias == nullptr) { + return cpu::linear(out->data(), in->data(), weight->data(), nullptr, out->dtype(), out->shape(), in->shape(), weight->shape(), {}); + } + return cpu::linear(out->data(), in->data(), weight->data(), bias->data(), out->dtype(), out->shape(), in->shape(), weight->shape(), bias->shape()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + if (bias == nullptr) { + return cpu::linear(out->data(), in->data(), weight->data(), nullptr, out->dtype(), out->shape(), in->shape(), weight->shape(), {}); + } else { + return cpu::linear(out->data(), in->data(), weight->data(), bias->data(), out->dtype(), out->shape(), in->shape(), weight->shape(), bias->shape()); + } +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::linear(out->data(), in->data(), weight->data(), bias == nullptr ? 
nullptr : bias->data(), out->dtype(), + out->shape()[0], out->shape()[1], in->shape()[1], llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/op.hpp b/src/ops/linear/op.hpp index 7bf06f017..75af49ac5 100644 --- a/src/ops/linear/op.hpp +++ b/src/ops/linear/op.hpp @@ -4,4 +4,7 @@ namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias); +void columnParallelLinear(tensor_t out_local, tensor_t in, tensor_t weight_local, tensor_t bias_local); +void rowParallelLinear(tensor_t out, tensor_t in_local, tensor_t weight_local); +tensor_t gatherLastDim(const tensor_t &local); } diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cpp b/src/ops/rearrange/cpu/rearrange_cpu.cpp new file mode 100644 index 000000000..b34b00b54 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.cpp @@ -0,0 +1,78 @@ +#include "rearrange_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void rearrange_(T *out, const T *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides) { + size_t ndim = shape.size(); + + if (ndim == 0) { + out[0] = in[0]; + return; + } + + // Calculate total number of elements + size_t numel = 1; + for (size_t i = 0; i < ndim; ++i) { + numel *= shape[i]; + } + + // Use indices to iterate through all elements + std::vector indices(ndim, 0); + + for (size_t i = 0; i < numel; ++i) { + // Calculate source and destination offsets + ptrdiff_t out_offset = 0; + ptrdiff_t in_offset = 0; + for (size_t d = 0; d < ndim; ++d) { + out_offset += indices[d] * out_strides[d]; + in_offset += indices[d] * in_strides[d]; + } + + // Copy element + out[out_offset] = in[in_offset]; + + // Increment indices (like counting in mixed radix) + for (ptrdiff_t d = ndim - 1; d >= 0; --d) { + indices[d]++; + if (indices[d] < shape[d]) { + break; + } + indices[d] = 0; + } + } +} + +namespace 
llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides, + llaisysDataType_t type) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_BF16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_F16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + case LLAISYS_DTYPE_I64: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/rearrange/cpu/rearrange_cpu.hpp b/src/ops/rearrange/cpu/rearrange_cpu.hpp new file mode 100644 index 000000000..d9fafb7b0 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.hpp @@ -0,0 +1,13 @@ +#pragma once +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, + const std::vector &shape, + const std::vector &out_strides, + const std::vector &in_strides, + llaisysDataType_t type); +} diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/ops/rearrange/nvidia/rearrange_nvidia.cu new file mode 100644 index 000000000..49180db76 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cu @@ -0,0 +1,95 @@ +#include "rearrange_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void rearrangeKernel(T *out, const T *in, size_t numel, size_t ndim, const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + 
threadIdx.x; linear_idx < numel; linear_idx += stride) { + size_t remaining = linear_idx; + ptrdiff_t out_offset = 0; + ptrdiff_t in_offset = 0; + for (ptrdiff_t dim = static_cast(ndim) - 1; dim >= 0; --dim) { + const size_t coord = remaining % shape[dim]; + remaining /= shape[dim]; + out_offset += static_cast(coord) * out_strides[dim]; + in_offset += static_cast(coord) * in_strides[dim]; + } + out[out_offset] = in[in_offset]; + } +} + +template +void launchRearrange(std::byte *out, const std::byte *in, size_t numel, size_t ndim, const size_t *shape_dev, + const ptrdiff_t *out_strides_dev, const ptrdiff_t *in_strides_dev, cudaStream_t stream) { + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + rearrangeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + numel, + ndim, + shape_dev, + out_strides_dev, + in_strides_dev); + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace + +void rearrange(std::byte *out, const std::byte *in, const std::vector &shape, const std::vector &out_strides, + const std::vector &in_strides, llaisysDataType_t type, llaisysStream_t stream) { + size_t numel = 1; + for (size_t dim : shape) { + numel *= dim; + } + if (numel == 0) { + return; + } + + const size_t ndim = shape.size(); + auto cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + size_t *shape_dev = nullptr; + ptrdiff_t *out_strides_dev = nullptr; + ptrdiff_t *in_strides_dev = nullptr; + CUDA_CHECK(cudaMalloc(&shape_dev, ndim * sizeof(size_t))); + CUDA_CHECK(cudaMalloc(&out_strides_dev, ndim * sizeof(ptrdiff_t))); + CUDA_CHECK(cudaMalloc(&in_strides_dev, ndim * sizeof(ptrdiff_t))); + CUDA_CHECK(cudaMemcpyAsync(shape_dev, shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, cuda_stream)); + CUDA_CHECK(cudaMemcpyAsync(out_strides_dev, out_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, cuda_stream)); + 
CUDA_CHECK(cudaMemcpyAsync(in_strides_dev, in_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, cuda_stream)); + + switch (type) { + case LLAISYS_DTYPE_F32: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + case LLAISYS_DTYPE_F16: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + case LLAISYS_DTYPE_BF16: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + case LLAISYS_DTYPE_I64: + launchRearrange(out, in, numel, ndim, shape_dev, out_strides_dev, in_strides_dev, cuda_stream); + break; + default: + CUDA_CHECK(cudaFree(shape_dev)); + CUDA_CHECK(cudaFree(out_strides_dev)); + CUDA_CHECK(cudaFree(in_strides_dev)); + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaFree(shape_dev)); + CUDA_CHECK(cudaFree(out_strides_dev)); + CUDA_CHECK(cudaFree(in_strides_dev)); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cuh b/src/ops/rearrange/nvidia/rearrange_nvidia.cuh new file mode 100644 index 000000000..213623a00 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cuh @@ -0,0 +1,21 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange(std::byte *out, const std::byte *in, const std::vector &shape, const std::vector &out_strides, + const std::vector &in_strides, llaisysDataType_t type, llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange(std::byte *out, const std::byte *in, const std::vector &shape, const std::vector &out_strides, const std::vector &in_strides, llaisysDataType_t type); +} diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 017a6ae59..5b0d28e11 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -1,7 +1,47 @@ #include 
"op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rearrange_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rearrange_nvidia.cuh" +#endif + namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rearrange(out->data(), in->data(), + out->shape(), out->strides(), in->strides(), + out->dtype()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rearrange(out->data(), in->data(), + out->shape(), out->strides(), in->strides(), + out->dtype()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + if (out->isContiguous() && in->isContiguous()) { + return llaisys::core::context().runtime().api()->memcpy_async( + out->data(), + in->data(), + out->numel() * out->elementSize(), + LLAISYS_MEMCPY_D2D, + llaisys::core::context().runtime().stream()); + } + return nvidia::rearrange(out->data(), in->data(), out->shape(), out->strides(), in->strides(), out->dtype(), + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 000000000..e9c960307 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,60 @@ +#include "rms_norm_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +/** + * RMS Norm 核心计算 + * + * 内存布局 (行优先): + * input/out 视为 [batch, hidden_size] 的二维数组 + * weight 是长度为 hidden_size 的一维数组 + * + * 计算流程 (对每个 batch i): + * 1. 计算 RMS_i = sqrt(mean(input[i][:]^2) + eps) + * 2. 
out[i][j] = (input[i][j] / RMS_i) * weight[j] + * + * 索引计算: + * - input[i][j] = input[i * hidden_size + j] + * - weight[j] 对所有 batch 共享 + */ +template +void rms_norm_(T *out, const T *input, const T *weight, size_t batch, size_t hidden_size, float eps) { + // 遍历每个独立的向量 (batch 维度) + for (size_t i = 0; i < batch; i++) { + // Step 1: 计算 sum(x^2),在 hidden_size 维度上累加 + float sum_sq = 0.0F; + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(input[i * hidden_size + j]); + sum_sq += val * val; + } + // Step 2: 计算 RMS = sqrt(mean(x^2) + eps) + float rms = std::sqrt(sum_sq / static_cast(hidden_size) + eps); + + // Step 3: 归一化并乘以 weight + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(input[i * hidden_size + j]); + float wgt = llaisys::utils::cast(weight[j]); + out[i * hidden_size + j] = llaisys::utils::cast((val / rms) * wgt); + } + } +} + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, size_t hidden_size, float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(input), + reinterpret_cast(weight), batch, hidden_size, eps); + case LLAISYS_DTYPE_BF16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(input), + reinterpret_cast(weight), batch, hidden_size, eps); + case LLAISYS_DTYPE_F16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(input), + reinterpret_cast(weight), batch, hidden_size, eps); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 000000000..7f3fa4f97 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *input, const 
std::byte *weight, llaisysDataType_t type, size_t batch, size_t hidden_size, float eps); +} diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu new file mode 100644 index 000000000..fd4b005fe --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -0,0 +1,86 @@ +#include "rms_norm_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void rmsNormKernel(T *out, const T *input, const T *weight, size_t hidden_size, float eps) { + __shared__ float shared_sum[256]; + + const size_t row = static_cast(blockIdx.x); + const size_t row_offset = row * hidden_size; + const int tid = threadIdx.x; + + float sum_sq = 0.0f; + for (size_t col = static_cast(tid); col < hidden_size; col += blockDim.x) { + const float value = llaisys::device::nvidia::scalarToFloat(input[row_offset + col]); + sum_sq += value * value; + } + + shared_sum[tid] = sum_sq; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + shared_sum[tid] += shared_sum[tid + offset]; + } + __syncthreads(); + } + + const float inv_rms = rsqrtf(shared_sum[0] / static_cast(hidden_size) + eps); + for (size_t col = static_cast(tid); col < hidden_size; col += blockDim.x) { + const float value = llaisys::device::nvidia::scalarToFloat(input[row_offset + col]); + const float scale = llaisys::device::nvidia::scalarToFloat(weight[col]); + out[row_offset + col] = llaisys::device::nvidia::floatToScalar(value * inv_rms * scale); + } +} + +} // namespace + +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, + size_t hidden_size, float eps, llaisysStream_t stream) { + if (batch == 0 || hidden_size == 0) { + return; + } + + constexpr int threads = 256; + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case 
LLAISYS_DTYPE_F32: + rmsNormKernel<<(batch), threads, 0, cuda_stream>>>( + reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(weight), + hidden_size, + eps); + break; + case LLAISYS_DTYPE_F16: + rmsNormKernel<<(batch), threads, 0, cuda_stream>>>( + reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(weight), + hidden_size, + eps); + break; + case LLAISYS_DTYPE_BF16: + rmsNormKernel<<(batch), threads, 0, cuda_stream>>>( + reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(weight), + hidden_size, + eps); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh new file mode 100644 index 000000000..3688b93a2 --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cuh @@ -0,0 +1,19 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, + size_t hidden_size, float eps, llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm(std::byte *out, const std::byte *input, const std::byte *weight, llaisysDataType_t type, size_t batch, size_t hidden_size, float eps); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9d..803c957c3 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,62 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rms_norm_nvidia.cuh" +#endif + namespace llaisys::ops { +/** + * RMS Norm (Root Mean Square Layer Normalization) + * + * 公式: out[i][j] = (in[i][j] / RMS(in[i])) * weight[j] + * 其中 RMS(x) = sqrt(mean(x^2) + eps) + * + * 
参数说明: + * - batch: 需要独立归一化的向量数量,等于 in.numel() / hidden_size + * 例如 in 形状为 [2, 3, 768],则 batch = 2*3 = 6 + * - hidden_size: 每个向量的特征维度,即 in 的最后一个维度 + * RMS 在此维度上计算,weight 的长度也等于 hidden_size + * + * 张量形状: + * - in/out: [..., hidden_size] (任意前导维度,最后一维是 hidden_size) + * - weight: [hidden_size] (所有向量共享同一组 weight) + */ void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), "RmsNorm: all tensors must be contiguous."); + + ASSERT(in->ndim() >= 1, "RmsNorm: in must have at least 1 dimension."); + ASSERT(weight->ndim() == 1, "RmsNorm: weight must be 1D."); + + // hidden_size: in 的最后一个维度,RMS 在此维度上计算 + size_t hidden_size = in->shape().back(); + // batch: 独立归一化的向量数量 (前导维度的乘积) + size_t batch = in->numel() / hidden_size; + + ASSERT(weight->shape()[0] == hidden_size, "RmsNorm: weight.shape[0] must be equal to in's last dimension."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch, hidden_size, eps); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch, hidden_size, eps); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch, hidden_size, eps, + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 000000000..1f9734252 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,53 @@ +#include 
"rope_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +void rope_(T *out, const T *in, const int64_t *pos_ids, float theta, size_t seq_len, size_t n_heads, size_t head_dim) { + const size_t half_dim = head_dim / 2; + // Precompute inverse frequencies (denominators) for each dimension j + // To match PyTorch's behavior: phi = pos / (theta ^ (2j/d)) + std::vector inv_freqs(half_dim); + for (size_t j = 0; j < half_dim; ++j) { + inv_freqs[j] = static_cast(std::pow(static_cast(theta), 2.0 * static_cast(j) / static_cast(head_dim))); + } + + for (size_t i = 0; i < seq_len; ++i) { + const float pos = static_cast(pos_ids[i]); + for (size_t h = 0; h < n_heads; ++h) { + const size_t base_idx = (i * n_heads + h) * head_dim; + const T* in_ptr = in + base_idx; + T* out_ptr = out + base_idx; + for (size_t j = 0; j < half_dim; ++j) { + float phi = pos / inv_freqs[j]; + float cos_phi = std::cos(phi); + float sin_phi = std::sin(phi); + // a = in[j], b = in[j + half_dim] + float a = llaisys::utils::cast(in_ptr[j]); + float b = llaisys::utils::cast(in_ptr[j + half_dim]); + float a_prime = a * cos_phi - b * sin_phi; + float b_prime = b * cos_phi + a * sin_phi; + out_ptr[j] = llaisys::utils::cast(a_prime); + out_ptr[j + half_dim] = llaisys::utils::cast(b_prime); + } + } + } +} + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), reinterpret_cast(in), pos_ids, theta, seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), pos_ids, theta, seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), pos_ids, theta, seq_len, n_heads, head_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace 
llaisys::ops::cpu diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 000000000..ca300f8bf --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim); +} \ No newline at end of file diff --git a/src/ops/rope/nvidia/rope_nvidia.cu b/src/ops/rope/nvidia/rope_nvidia.cu new file mode 100644 index 000000000..b24016927 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cu @@ -0,0 +1,88 @@ +#include "rope_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void ropeKernel(T *out, const T *in, const int64_t *pos_ids, float theta, size_t seq_len, size_t n_heads, size_t head_dim) { + const size_t half_dim = head_dim / 2; + const size_t total = seq_len * n_heads * half_dim; + const size_t stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear_idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; linear_idx < total; linear_idx += stride) { + const size_t j = linear_idx % half_dim; + const size_t tmp = linear_idx / half_dim; + const size_t h = tmp % n_heads; + const size_t i = tmp / n_heads; + const size_t base_idx = (i * n_heads + h) * head_dim; + + const float pos = static_cast(pos_ids[i]); + const float phi = pos / powf(theta, 2.0f * static_cast(j) / static_cast(head_dim)); + const float cos_phi = cosf(phi); + const float sin_phi = sinf(phi); + + const float a = llaisys::device::nvidia::scalarToFloat(in[base_idx + j]); + const float b = llaisys::device::nvidia::scalarToFloat(in[base_idx + j + half_dim]); + out[base_idx + j] = llaisys::device::nvidia::floatToScalar(a * cos_phi - b * sin_phi); + out[base_idx + j + half_dim] = 
llaisys::device::nvidia::floatToScalar(b * cos_phi + a * sin_phi); + } +} + +} // namespace + +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, llaisysStream_t stream) { + const size_t total = seq_len * n_heads * (head_dim / 2); + if (total == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((total + threads - 1) / threads, static_cast(max_blocks))); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + ropeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + pos_ids, + theta, + seq_len, + n_heads, + head_dim); + break; + case LLAISYS_DTYPE_F16: + ropeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + pos_ids, + theta, + seq_len, + n_heads, + head_dim); + break; + case LLAISYS_DTYPE_BF16: + ropeKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + pos_ids, + theta, + seq_len, + n_heads, + head_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rope/nvidia/rope_nvidia.cuh b/src/ops/rope/nvidia/rope_nvidia.cuh new file mode 100644 index 000000000..b9f636564 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cuh @@ -0,0 +1,20 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rope(std::byte *out, const std::byte *in, const int64_t *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim); +} diff --git 
a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64e..dfe24b350 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,46 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rope_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rope_nvidia.cuh" +#endif + namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(), "RoPE: all tensors must be contiguous."); + ASSERT(out->shape() == in->shape(), "RoPE: out and in must have same shape."); + ASSERT(out->ndim() == 3, "RoPE: out must be 3D tensor [seq_len, n_heads, head_dim]."); + ASSERT(in->ndim() == 3, "RoPE: in must be 3D tensor [seq_len, n_heads, head_dim]."); + ASSERT(pos_ids->ndim() == 1, "RoPE: pos_ids must be 1D tensor [seq_len]."); + ASSERT(pos_ids->shape()[0] == out->shape()[0], "RoPE: pos_ids length must match seq_len."); + ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids must be int64."); + ASSERT(out->shape()[2] % 2 == 0, "RoPE: head_dim must be even."); + + size_t seq_len = out->shape()[0]; + size_t n_heads = out->shape()[1]; + size_t head_dim = out->shape()[2]; + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), reinterpret_cast(pos_ids->data()), theta, out->dtype(), seq_len, n_heads, head_dim); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), reinterpret_cast(pos_ids->data()), theta, out->dtype(), seq_len, n_heads, head_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rope(out->data(), in->data(), reinterpret_cast(pos_ids->data()), theta, out->dtype(), seq_len, + n_heads, head_dim, 
llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 000000000..ad735cea0 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,113 @@ +#include "self_attention_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include + +template +void self_attention_(T *attn_val, const T *q, const T *k, const T *v, + float scale, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim) { + // Calculate the group size for GQA (Grouped Query Attention) + // Each kv head is shared by (nhead / nkvhead) query heads + const size_t group_size = nhead / nkvhead; + + // For each query position + for (size_t qi = 0; qi < qlen; ++qi) { + // For each attention head + for (size_t h = 0; h < nhead; ++h) { + // Find the corresponding kv head for this query head + const size_t kv_h = h / group_size; + + // Compute attention scores for this query position and head + // Q[qi, h, :] @ K[:, kv_h, :].T -> scores[kvlen] + std::vector scores(kvlen); + + // Calculate Q @ K^T * scale + for (size_t ki = 0; ki < kvlen; ++ki) { + double score = 0.0; + for (size_t d = 0; d < head_dim; ++d) { + // q index: [qi, h, d] -> qi * nhead * head_dim + h * head_dim + d + // k index: [ki, kv_h, d] -> ki * nkvhead * head_dim + kv_h * head_dim + d + double q_val = llaisys::utils::cast(q[qi * nhead * head_dim + h * head_dim + d]); + double k_val = llaisys::utils::cast(k[ki * nkvhead * head_dim + kv_h * head_dim + d]); + score += q_val * k_val; + } + scores[ki] = score * static_cast(scale); + } + + // Apply causal mask: for position qi in query, we can only attend to positions 0..qi+(kvlen-qlen) + // The causal mask is: temp_mask = torch.ones(L, S).tril(diagonal=S-L) + // attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + // This 
means: for query position qi, we can attend to key positions 0 to (kvlen - qlen + qi) + const size_t max_attend = kvlen - qlen + qi; + for (size_t ki = max_attend + 1; ki < kvlen; ++ki) { + scores[ki] = -std::numeric_limits::infinity(); + } + + // Compute softmax + // Find max for numerical stability + double max_score = scores[0]; + for (size_t ki = 1; ki < kvlen; ++ki) { + if (scores[ki] > max_score) { + max_score = scores[ki]; + } + } + + // Compute exp and sum + double sum_exp = 0.0; + for (size_t ki = 0; ki < kvlen; ++ki) { + scores[ki] = std::exp(scores[ki] - max_score); + sum_exp += scores[ki]; + } + + // Normalize + for (size_t ki = 0; ki < kvlen; ++ki) { + scores[ki] /= sum_exp; + } + + // Compute attention output: softmax(scores) @ V + // attn_val[qi, h, :] = sum over ki of (scores[ki] * V[ki, kv_h, :]) + for (size_t d = 0; d < head_dim; ++d) { + double out_val = 0.0; + for (size_t ki = 0; ki < kvlen; ++ki) { + // v index: [ki, kv_h, d] -> ki * nkvhead * head_dim + kv_h * head_dim + d + double v_val = llaisys::utils::cast(v[ki * nkvhead * head_dim + kv_h * head_dim + d]); + out_val += scores[ki] * v_val; + } + // attn_val index: [qi, h, d] -> qi * nhead * head_dim + h * head_dim + d + attn_val[qi * nhead * head_dim + h * head_dim + d] = llaisys::utils::cast(out_val); + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t type, + size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nhead, nkvhead, head_dim); + case LLAISYS_DTYPE_BF16: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nhead, nkvhead, head_dim); + case 
LLAISYS_DTYPE_F16: + return self_attention_(reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nhead, nkvhead, head_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 000000000..83e56b126 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + float scale, llaisysDataType_t type, + size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim); +} diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu new file mode 100644 index 000000000..af559e99b --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -0,0 +1,145 @@ +#include "self_attention_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void selfAttentionKernel(T *attn_val, const T *q, const T *k, const T *v, float scale, size_t qlen, size_t kvlen, + size_t nhead, size_t nkvhead, size_t head_dim) { + extern __shared__ float shared_mem[]; + float *scores = shared_mem; + float *reduce_buf = shared_mem + kvlen; + + const size_t qi = static_cast(blockIdx.x); + const size_t h = static_cast(blockIdx.y); + const size_t group_size = nhead / nkvhead; + const size_t kv_h = h / group_size; + const int tid = threadIdx.x; + + const size_t q_base = (qi * nhead + h) * head_dim; + const size_t max_attend = kvlen - qlen + qi; + + float thread_max = -INFINITY; + for (size_t ki = static_cast(tid); ki < kvlen; ki += blockDim.x) { + float score = -INFINITY; 
+ if (ki <= max_attend) { + score = 0.0f; + const size_t k_base = (ki * nkvhead + kv_h) * head_dim; + for (size_t d = 0; d < head_dim; ++d) { + score += llaisys::device::nvidia::scalarToFloat(q[q_base + d]) + * llaisys::device::nvidia::scalarToFloat(k[k_base + d]); + } + score *= scale; + } + scores[ki] = score; + if (score > thread_max) { + thread_max = score; + } + } + + reduce_buf[tid] = thread_max; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + reduce_buf[tid] = fmaxf(reduce_buf[tid], reduce_buf[tid + offset]); + } + __syncthreads(); + } + const float max_score = reduce_buf[0]; + + float thread_sum = 0.0f; + for (size_t ki = static_cast(tid); ki < kvlen; ki += blockDim.x) { + const float prob = expf(scores[ki] - max_score); + scores[ki] = prob; + thread_sum += prob; + } + + reduce_buf[tid] = thread_sum; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + reduce_buf[tid] += reduce_buf[tid + offset]; + } + __syncthreads(); + } + const float inv_sum = 1.0f / reduce_buf[0]; + + for (size_t d = static_cast(tid); d < head_dim; d += blockDim.x) { + float out_val = 0.0f; + for (size_t ki = 0; ki < kvlen; ++ki) { + const size_t v_base = (ki * nkvhead + kv_h) * head_dim; + out_val += (scores[ki] * inv_sum) * llaisys::device::nvidia::scalarToFloat(v[v_base + d]); + } + attn_val[q_base + d] = llaisys::device::nvidia::floatToScalar(out_val); + } +} + +} // namespace + +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim, + llaisysStream_t stream) { + if (qlen == 0 || kvlen == 0 || nhead == 0 || nkvhead == 0 || head_dim == 0) { + return; + } + + constexpr int threads = 128; + const dim3 grid(static_cast(qlen), static_cast(nhead), 1); + const size_t shared_bytes = (kvlen + threads) * sizeof(float); + 
const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + selfAttentionKernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, + qlen, + kvlen, + nhead, + nkvhead, + head_dim); + break; + case LLAISYS_DTYPE_F16: + selfAttentionKernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, + qlen, + kvlen, + nhead, + nkvhead, + head_dim); + break; + case LLAISYS_DTYPE_BF16: + selfAttentionKernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, + qlen, + kvlen, + nhead, + nkvhead, + head_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cuh b/src/ops/self_attention/nvidia/self_attention_nvidia.cuh new file mode 100644 index 000000000..b702a8082 --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cuh @@ -0,0 +1,20 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim, + llaisysStream_t stream); +} +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nhead, size_t nkvhead, size_t head_dim); +} diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d620142..dd5e12686 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,65 @@ 
#include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/self_attention_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/self_attention_nvidia.cuh" +#endif + namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype()); + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "SelfAttention: all tensors must be contiguous."); + + // Shape checks + // q: [qlen, nhead, head_dim] + // k: [kvlen, nkvhead, head_dim] + // v: [kvlen, nkvhead, head_dim] + // attn_val: [qlen, nhead, head_dim] + ASSERT(q->ndim() == 3, "SelfAttention: q must be 3D tensor [qlen, nhead, head_dim]."); + ASSERT(k->ndim() == 3, "SelfAttention: k must be 3D tensor [kvlen, nkvhead, head_dim]."); + ASSERT(v->ndim() == 3, "SelfAttention: v must be 3D tensor [kvlen, nkvhead, head_dim]."); + ASSERT(attn_val->ndim() == 3, "SelfAttention: attn_val must be 3D tensor [qlen, nhead, head_dim]."); + + size_t qlen = q->shape()[0]; + size_t nhead = q->shape()[1]; + size_t head_dim = q->shape()[2]; + size_t kvlen = k->shape()[0]; + size_t nkvhead = k->shape()[1]; + + // Validate shapes + ASSERT(k->shape()[0] == v->shape()[0], "SelfAttention: k and v must have same kvlen."); + ASSERT(k->shape()[1] == v->shape()[1], "SelfAttention: k and v must have same nkvhead."); + ASSERT(k->shape()[2] == head_dim, "SelfAttention: k head_dim must match q head_dim."); + ASSERT(v->shape()[2] == head_dim, "SelfAttention: v head_dim must match q head_dim."); + ASSERT(attn_val->shape()[0] == qlen, "SelfAttention: attn_val qlen must match q qlen."); + ASSERT(attn_val->shape()[1] == nhead, "SelfAttention: attn_val nhead must match q nhead."); + ASSERT(attn_val->shape()[2] == head_dim, "SelfAttention: attn_val head_dim must match q head_dim."); + 
ASSERT(nhead % nkvhead == 0, "SelfAttention: nhead must be divisible by nkvhead for GQA."); + ASSERT(kvlen >= qlen, "SelfAttention: kvlen must be >= qlen."); + + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + scale, attn_val->dtype(), qlen, kvlen, nhead, nkvhead, head_dim); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + scale, attn_val->dtype(), qlen, kvlen, nhead, nkvhead, head_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, attn_val->dtype(), qlen, kvlen, + nhead, nkvhead, head_dim, llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 000000000..e34414acd --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,51 @@ +#include "swiglu_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +// SwiGLU: out = up * silu(gate) +// where silu(x) = x * sigmoid(x) = x / (1 + exp(-x)) +// Formula: out_i = up_i * gate_i / (1 + exp(-gate_i)) + +template +void swiglu_(T *out, const T *gate, const T *up, size_t numel) { + for (size_t i = 0; i < numel; ++i) { + // Convert to double for numerical stability + double gate_val = llaisys::utils::cast(gate[i]); + double up_val = llaisys::utils::cast(up[i]); + + // silu(gate) = gate / (1 + exp(-gate)) + double silu_gate = gate_val / (1.0 + std::exp(-gate_val)); + + // out = up * silu(gate) + double out_val = up_val * silu_gate; + + out[i] = llaisys::utils::cast(out_val); + } +} + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const 
std::byte *up, + llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + case LLAISYS_DTYPE_BF16: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 000000000..31debbec1 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, + llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/ops/swiglu/nvidia/swiglu_nvidia.cu new file mode 100644 index 000000000..928dbd5ff --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -0,0 +1,63 @@ +#include "swiglu_nvidia.cuh" + +#include "../../../device/nvidia/cuda_utils.cuh" + +#include +#include + +namespace llaisys::ops::nvidia { +namespace { + +template +__global__ void swigluKernel(T *out, const T *gate, const T *up, size_t numel) { + const size_t stride = static_cast(blockDim.x) * gridDim.x; + for (size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; idx < numel; idx += stride) { + const float gate_val = llaisys::device::nvidia::scalarToFloat(gate[idx]); + const float up_val = llaisys::device::nvidia::scalarToFloat(up[idx]); + const float silu = gate_val / (1.0f + expf(-gate_val)); + out[idx] = llaisys::device::nvidia::floatToScalar(up_val * silu); + } +} + +} // namespace + +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel, llaisysStream_t 
stream) { + if (numel == 0) { + return; + } + + constexpr int threads = 256; + const int max_blocks = 4096; + const int blocks = static_cast(std::min((numel + threads - 1) / threads, static_cast(max_blocks))); + const cudaStream_t cuda_stream = llaisys::device::nvidia::toCudaStream(stream); + + switch (type) { + case LLAISYS_DTYPE_F32: + swigluKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + case LLAISYS_DTYPE_F16: + swigluKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + case LLAISYS_DTYPE_BF16: + swigluKernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + CUDA_CHECK(cudaGetLastError()); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cuh b/src/ops/swiglu/nvidia/swiglu_nvidia.cuh new file mode 100644 index 000000000..73d853457 --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t numel, llaisysStream_t stream); +} diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc97..521e54d6c 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,37 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/swiglu_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/swiglu_nvidia.cuh" +#endif + namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_SHAPE(out->shape(), gate->shape(), up->shape()); + CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype()); + ASSERT(out->isContiguous() && gate->isContiguous() && 
up->isContiguous(), + "SwiGLU: all tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel(), + llaisys::core::context().runtime().stream()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb65..a0e538aaa 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -1,10 +1,14 @@ #include "tensor.hpp" #include "../utils.hpp" +#include "llaisys.h" +#include #include #include #include +#include +#include namespace llaisys { @@ -164,27 +168,92 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + // row-major order, contiguous in memory + if (ndim() == 0) { // ndim == 0 means scalar, contiguous in memory + return true; + } + ptrdiff_t expected_stride = 1; + for (size_t i = ndim(); i > 0; i--) { + // be attentive to the index of the dimension, overflow problem + size_t dim = i - 1; + if (strides()[dim] != expected_stride) { + return false; + } + expected_stride *= static_cast(shape()[dim]); + } return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // TO_BE_IMPLEMENTED(); + std::vector new_shape(order.size(), 0); + std::vector new_strides(order.size(), 0); + for (size_t i = 0; i < order.size(); i++) { + size_t dim = order[i]; + new_shape[i] = shape()[dim]; + new_strides[i] = strides()[dim]; + } + TensorMeta 
new_meta{dtype(), new_shape, new_strides}; + return std::shared_ptr(new Tensor(new_meta, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // TO_BE_IMPLEMENTED(); + // 1.must be contiguous + if (!isContiguous()) { + throw std::runtime_error("Can't view non-contiguous tensor."); + } + + // 2.new shape must have the same number of elements as the original tensor + size_t new_numel = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies()); + if (new_numel != numel()) { + throw std::runtime_error("Can't view tensor with different number of elements."); + } + + // 3.calculate new strides + size_t new_ndim = shape.size(); + std::vector new_strides(new_ndim); + ptrdiff_t stride = 1; + for (size_t i = new_ndim; i > 0; i--) { + size_t dim = i - 1; + new_strides[dim] = stride; + stride *= static_cast(shape[dim]); + } + + // 4.create new tensor + TensorMeta new_meta{dtype(), shape, new_strides}; + return std::shared_ptr(new Tensor(new_meta, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // TO_BE_IMPLEMENTED(); + if (dim >= ndim() || start >= shape()[dim] || end > shape()[dim] || start >= end) { + throw std::runtime_error("Can't slice out of range."); + } + std::vector new_shape = shape(); + std::vector new_strides = strides(); + + new_shape[dim] = end - start; + TensorMeta new_meta{dtype(), new_shape, strides()}; + size_t new_offset = _offset + start * strides()[dim] * elementSize(); + return std::shared_ptr(new Tensor(new_meta, _storage, new_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + // TO_BE_IMPLEMENTED(); + size_t total_bytes = numel() * elementSize(); + core::context().setDevice(deviceType(), deviceId()); + + if (!isContiguous()) { + throw std::runtime_error("Can't load into non-contiguous 
tensor."); + } + + if (deviceType() == LLAISYS_DEVICE_CPU) { + std::memcpy(data(), src_, total_bytes); + } else { + core::context().runtime().api()->memcpy_sync(data(), src_, total_bytes, LLAISYS_MEMCPY_H2D); + } } tensor_t Tensor::contiguous() const { diff --git a/test/ops/self_attention.py b/test/ops/self_attention.py index a042b51be..abf3927a8 100644 --- a/test/ops/self_attention.py +++ b/test/ops/self_attention.py @@ -15,7 +15,7 @@ def torch_self_attention(attn_val, query, key, value, scale): L, S = query.size(-2), key.size(-2) attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device) - temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=S-L) + temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=S-L) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) diff --git a/test/test_dist.py b/test/test_dist.py new file mode 100644 index 000000000..556b94735 --- /dev/null +++ b/test/test_dist.py @@ -0,0 +1,95 @@ +import argparse +import os +import tempfile +import uuid + +import llaisys +import torch +import torch.multiprocessing as mp + + +def tensor_from_torch(torch_tensor: torch.Tensor, rank: int) -> llaisys.Tensor: + tensor = llaisys.Tensor( + torch_tensor.shape, + dtype=llaisys.DataType.F32, + device=llaisys.DeviceType.NVIDIA, + device_id=rank, + ) + api = llaisys.RuntimeAPI(llaisys.DeviceType.NVIDIA) + api.set_device(rank) + api.memcpy_sync( + tensor.data_ptr(), + torch_tensor.data_ptr(), + torch_tensor.numel() * torch_tensor.element_size(), + llaisys.MemcpyKind.D2D, + ) + return tensor + + +def torch_from_tensor(tensor: llaisys.Tensor, shape, rank: int) -> torch.Tensor: + result = torch.empty(shape, dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + api = llaisys.RuntimeAPI(llaisys.DeviceType.NVIDIA) + api.set_device(rank) + api.memcpy_sync( + result.data_ptr(), + tensor.data_ptr(), + result.numel() * result.element_size(), + llaisys.MemcpyKind.D2D, + ) + return result 
+ + +def worker(rank: int, world_size: int) -> None: + api = llaisys.RuntimeAPI(llaisys.DeviceType.NVIDIA) + api.set_device(rank) + dist = llaisys.DistributedContext() + dist.init(rank, world_size) + + local = torch.full((4,), float(rank + 1), dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + reduced = tensor_from_torch(local.clone(), rank) + dist.all_reduce(reduced) + reduced_result = torch_from_tensor(reduced, local.shape, rank) + expected_reduce = torch.full_like(local, float(world_size * (world_size + 1) // 2)) + torch.testing.assert_close(reduced_result, expected_reduce) + + gathered = dist.all_gather(tensor_from_torch(local, rank)) + gathered_result = torch_from_tensor(gathered, (world_size, 4), rank) + expected_gather = torch.stack( + [ + torch.full((4,), float(peer_rank + 1), dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + for peer_rank in range(world_size) + ] + ) + torch.testing.assert_close(gathered_result, expected_gather) + + broadcast_source = ( + torch.tensor([10.0, 11.0, 12.0, 13.0], dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + if rank == 0 + else torch.zeros((4,), dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + ) + broadcast_tensor = tensor_from_torch(broadcast_source, rank) + dist.broadcast(broadcast_tensor, root=0) + broadcast_result = torch_from_tensor(broadcast_tensor, (4,), rank) + expected_broadcast = torch.tensor([10.0, 11.0, 12.0, 13.0], dtype=torch.float32, device=torch.device(f"cuda:{rank}")) + torch.testing.assert_close(broadcast_result, expected_broadcast) + + dist.barrier() + dist.finalize() + api.device_synchronize() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="nvidia", choices=["nvidia"], type=str) + parser.add_argument("--world-size", default=8, type=int) + args = parser.parse_args() + + assert args.device == "nvidia" + assert torch.cuda.device_count() >= args.world_size + + os.environ["LLAISYS_DIST_BOOTSTRAP_PATH"] = 
os.path.join( + tempfile.gettempdir(), f"llaisys_nccl_{uuid.uuid4().hex}.bin" + ) + mp.spawn(worker, args=(args.world_size,), nprocs=args.world_size, join=True) + + print("\033[92mDistributed test passed!\033[0m\n") diff --git a/test/test_infer.py b/test/test_infer.py index 59d06b874..7c3ebb203 100644 --- a/test/test_infer.py +++ b/test/test_infer.py @@ -55,7 +55,7 @@ def hf_infer( def load_llaisys_model(model_path, device_name): - model = llaisys.models.Qwen2(model_path, llaisys_device(device_name)) + model = llaisys.models.Qwen2(model_path, llaisys_device(device_name), device_id=0, rank=0, world_size=1) return model diff --git a/test/test_infer_dist.py b/test/test_infer_dist.py new file mode 100644 index 000000000..1c5aaa38b --- /dev/null +++ b/test/test_infer_dist.py @@ -0,0 +1,107 @@ +import argparse +import gc +import os +import tempfile +import time +import uuid + +import torch +import torch.multiprocessing as mp +from transformers import AutoTokenizer + +import llaisys + + +def build_input_tokens(tokenizer, prompt: str): + input_content = tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + return tokenizer.encode(input_content) + + +def worker(rank: int, world_size: int, model_path: str, inputs, max_new_tokens: int, result_holder): + dist = llaisys.DistributedContext() + dist.init(rank, world_size) + + load_start = time.time() + model = llaisys.models.Qwen2( + model_path, + llaisys.DeviceType.NVIDIA, + device_id=rank, + rank=rank, + world_size=world_size, + ) + load_elapsed = time.time() - load_start + + dist.barrier() + generate_start = time.time() + outputs = model.generate(inputs, max_new_tokens=max_new_tokens, top_k=1, top_p=1.0, temperature=1.0) + dist.barrier() + generate_elapsed = time.time() - generate_start + + if rank == 0: + result_holder["tokens"] = outputs + result_holder["load_latency"] = load_elapsed + result_holder["generate_latency"] = generate_elapsed + + 
del model + dist.finalize() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="nvidia", choices=["nvidia"], type=str) + parser.add_argument("--model", required=True, type=str) + parser.add_argument("--prompt", default="Who are you?", type=str) + parser.add_argument("--max_steps", default=32, type=int) + parser.add_argument("--world-size", default=8, type=int) + parser.add_argument("--test", action="store_true") + args = parser.parse_args() + + assert torch.cuda.device_count() >= args.world_size + + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + inputs = build_input_tokens(tokenizer, args.prompt) + + single_load_start = time.time() + single_model = llaisys.models.Qwen2( + args.model, + llaisys.DeviceType.NVIDIA, + device_id=0, + rank=0, + world_size=1, + ) + single_load_latency = time.time() - single_load_start + start_time = time.time() + single_tokens = single_model.generate(inputs, max_new_tokens=args.max_steps, top_k=1, top_p=1.0, temperature=1.0) + single_generate_latency = time.time() - start_time + del single_model + gc.collect() + torch.cuda.empty_cache() + + os.environ["LLAISYS_DIST_BOOTSTRAP_PATH"] = os.path.join( + tempfile.gettempdir(), f"llaisys_qwen2_tp_{uuid.uuid4().hex}.bin" + ) + + manager = mp.Manager() + result_holder = manager.dict() + mp.spawn(worker, args=(args.world_size, args.model, inputs, args.max_steps, result_holder), nprocs=args.world_size, join=True) + + dist_tokens = list(result_holder["tokens"]) + dist_load_latency = float(result_holder["load_latency"]) + dist_generate_latency = float(result_holder["generate_latency"]) + + print("Single-card tokens:") + print(single_tokens) + print(f"Single-card load latency: {single_load_latency:.2f}s") + print(f"Single-card generate latency: {single_generate_latency:.2f}s") + print("\nDistributed tokens:") + print(dist_tokens) + print(f"Distributed load latency: {dist_load_latency:.2f}s") + 
print(f"Distributed generate latency: {dist_generate_latency:.2f}s") + + if args.test: + assert dist_tokens == single_tokens + print("\033[92mDistributed inference test passed!\033[0m\n") diff --git a/xmake.lua b/xmake.lua index 1f65f7a95..7fd7cd651 100644 --- a/xmake.lua +++ b/xmake.lua @@ -37,6 +37,9 @@ target("llaisys-device") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-device-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -83,6 +86,9 @@ target_end() target("llaisys-ops") set_kind("static") add_deps("llaisys-ops-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-ops-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -95,6 +101,22 @@ target("llaisys-ops") on_install(function (target) end) target_end() +target("llaisys-models") + set_kind("static") + add_deps("llaisys-tensor") + add_deps("llaisys-ops") + + set_languages("cxx17") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + end + + add_files("src/models/*/*.cpp") + + on_install(function (target) end) +target_end() + target("llaisys") set_kind("shared") add_deps("llaisys-utils") @@ -102,6 +124,7 @@ target("llaisys") add_deps("llaisys-core") add_deps("llaisys-tensor") add_deps("llaisys-ops") + add_deps("llaisys-models") set_languages("cxx17") set_warnings("all", "error") diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua new file mode 100644 index 000000000..5c87bf794 --- /dev/null +++ b/xmake/nvidia.lua @@ -0,0 +1,35 @@ +local function config_nvidia_target() + add_rules("cuda") + set_values("cuda.build.devlink", true) + set_languages("cxx17") + set_warnings("all", "error") + add_cugencodes("native") + add_cuflags("--extended-lambda", {force = true}) + add_cuflags("--expt-relaxed-constexpr", {force = true}) + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + add_cuflags("-Xcompiler=-fPIC", 
{force = true}) + add_culdflags("-Xcompiler -fPIC", {force = true}) + end + add_links("cudart", "cublas", "nccl") +end + +target("llaisys-device-nvidia") + set_kind("static") + config_nvidia_target() + add_files("../src/device/nvidia/*.cu") + + on_install(function (target) end) +target_end() + +target("llaisys-ops-nvidia") + set_kind("static") + add_deps("llaisys-tensor") + config_nvidia_target() + + for _, file in ipairs(os.files("../src/ops/*/nvidia/*.cu")) do + add_files(file) + end + + on_install(function (target) end) +target_end()