From e4c31c0c21d19aa78739a7fb68b5b882284ab9ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Mon, 26 Jan 2026 16:09:56 +0800 Subject: [PATCH 1/9] =?UTF-8?q?=E5=AE=8C=E6=88=90=20Assignment=20#1:=20?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=20Tensor=20=E5=85=83=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=20(load,=20isContiguous,=20view,=20permute,?= =?UTF-8?q?=20slice)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tensor/tensor.cpp | 58 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb65..1d4daa48b 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -164,27 +164,67 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + // 逻辑:从最后一个维度开始检查 + // 每一个维度的步长(stride)应该等于它后方所有维度大小(shape)的乘积 + size_t expected_stride = 1; + for (int i = (int)ndim() - 1; i >= 0; --i) { + if (shape()[i] > 1) { // 维度大小为 1 的可以跳过,因为它不影响连续性 + if (strides()[i] != (ptrdiff_t)expected_stride) { + return false; + } + expected_stride *= shape()[i]; + } + } return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + std::vector new_shape; + std::vector new_strides; + for (auto i : order) { + new_shape.push_back(_meta.shape[i]); + new_strides.push_back(_meta.strides[i]); + } + return std::shared_ptr(new Tensor({dtype(), new_shape, new_strides}, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // 检查元素总数是否相等 + size_t new_numel = 1; + for (auto s : shape) new_numel *= s; + if (new_numel != this->numel()) throw std::runtime_error("view: numel mismatch"); + + // 基础作业要求:非连续张量不允许 view + if (!this->isContiguous()) throw std::runtime_error("view: tensor is not contiguous"); 
+ + // 计算新步长 + std::vector new_strides(shape.size()); + size_t st = 1; + for (int i = (int)shape.size() - 1; i >= 0; --i) { + new_strides[i] = st; + st *= shape[i]; + } + return std::shared_ptr(new Tensor({dtype(), shape, new_strides}, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // 新的偏移量计算:原偏移 + 开始位置 * 步长 * 单个元素字节数 + size_t new_offset = _offset + start * _meta.strides[dim] * elementSize(); + std::vector new_shape = _meta.shape; + new_shape[dim] = end - start; + return std::shared_ptr(new Tensor({dtype(), new_shape, _meta.strides}, _storage, new_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + size_t bytes = this->numel() * this->elementSize(); + // 设置当前设备并执行拷贝 + core::context().setDevice(this->deviceType(), this->deviceId()); + core::context().runtime().api()->memcpy_sync( + this->data(), // 目标地址:由 data() 计算得出(含 offset) + src_, // 源地址 + bytes, + LLAISYS_MEMCPY_H2D // Host to Device + ); } tensor_t Tensor::contiguous() const { @@ -202,4 +242,4 @@ tensor_t Tensor::to(llaisysDeviceType_t device_type, int device) const { return std::shared_ptr(new Tensor(_meta, _storage)); } -} // namespace llaisys +} // namespace llaisys \ No newline at end of file From e5e8af7fc6f88712b2062aabd33d7ba2dc8fe32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Mon, 26 Jan 2026 16:19:32 +0800 Subject: [PATCH 2/9] =?UTF-8?q?=E8=A1=A5=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tensor/tensor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 1d4daa48b..4bf89c00d 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -209,6 +209,7 @@ tensor_t Tensor::view(const std::vector &shape) const { tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { // 新的偏移量计算:原偏移 + 开始位置 * 步长 
* 单个元素字节数 + // size_t new_offset = _offset + start * _meta.strides[dim] * elementSize(); std::vector new_shape = _meta.shape; new_shape[dim] = end - start; From 1a37e8580aaca0cf1d5bd5509fff36b2cca2307b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Tue, 27 Jan 2026 16:30:49 +0800 Subject: [PATCH 3/9] feat: implement all core operators for Task-2 (Argmax, Embedding, Linear, RMSNorm, RoPE, Self-Attention, SwiGLU, Rearrange) --- src/ops/argmax/cpu/argmax_cpu.cpp | 45 ++++++++++ src/ops/argmax/cpu/argmax_cpu.hpp | 9 ++ src/ops/argmax/op.cpp | 18 +++- src/ops/embedding/cpu/embedding_cpu.cpp | 38 ++++++++ src/ops/embedding/cpu/embedding_cpu.hpp | 8 ++ src/ops/embedding/op.cpp | 27 +++++- src/ops/linear/cpu/linear_cpu.cpp | 43 +++++++++ src/ops/linear/cpu/linear_cpu.hpp | 8 ++ src/ops/linear/op.cpp | 29 ++++++- src/ops/rearrange/cpu/rearrange_cpu.cpp | 52 +++++++++++ src/ops/rearrange/cpu/rearrange_cpu.hpp | 11 +++ src/ops/rearrange/op.cpp | 17 +++- src/ops/rms_norm/cpu/rms_norm_cpu.cpp | 47 ++++++++++ src/ops/rms_norm/cpu/rms_norm_cpu.hpp | 8 ++ src/ops/rms_norm/op.cpp | 24 ++++- src/ops/rope/cpu/rope_cpu.cpp | 59 +++++++++++++ src/ops/rope/cpu/rope_cpu.hpp | 8 ++ src/ops/rope/op.cpp | 26 +++++- .../self_attention/cpu/self_attention_cpu.cpp | 87 +++++++++++++++++++ .../self_attention/cpu/self_attention_cpu.hpp | 9 ++ src/ops/self_attention/op.cpp | 26 +++++- src/ops/swiglu/cpu/swiglu_cpu.cpp | 36 ++++++++ src/ops/swiglu/cpu/swiglu_cpu.hpp | 8 ++ src/ops/swiglu/op.cpp | 26 +++++- 24 files changed, 653 insertions(+), 16 deletions(-) create mode 100644 src/ops/argmax/cpu/argmax_cpu.cpp create mode 100644 src/ops/argmax/cpu/argmax_cpu.hpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.cpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.hpp create mode 100644 src/ops/linear/cpu/linear_cpu.cpp create mode 100644 src/ops/linear/cpu/linear_cpu.hpp create mode 100644 src/ops/rearrange/cpu/rearrange_cpu.cpp create mode 100644 
src/ops/rearrange/cpu/rearrange_cpu.hpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.cpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.hpp create mode 100644 src/ops/rope/cpu/rope_cpu.cpp create mode 100644 src/ops/rope/cpu/rope_cpu.hpp create mode 100644 src/ops/self_attention/cpu/self_attention_cpu.cpp create mode 100644 src/ops/self_attention/cpu/self_attention_cpu.hpp create mode 100644 src/ops/swiglu/cpu/swiglu_cpu.cpp create mode 100644 src/ops/swiglu/cpu/swiglu_cpu.hpp diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 000000000..1d5128ca0 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,45 @@ +#include "argmax_cpu.hpp" +#include "../../../utils.hpp" +#include + +template +void argmax_(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + if (numel == 0) return; + + // 初始化:假设第一个元素就是最大的 + T current_max = vals[0]; + int64_t current_idx = 0; + + for (size_t i = 1; i < numel; i++) { + // 使用 cast 转为 float 比较,以支持 fp16/bf16 + if (llaisys::utils::cast(vals[i]) > llaisys::utils::cast(current_max)) { + current_max = vals[i]; + current_idx = static_cast(i); + } + } + + *max_val = current_max; + *max_idx = current_idx; +} + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, + llaisysDataType_t type, size_t numel) { + // max_idx 固定为 I64 + int64_t *idx_ptr = reinterpret_cast(max_idx); + + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_(idx_ptr, reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_BF16: + return argmax_(idx_ptr, reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_F16: + return argmax_(idx_ptr, reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp 
b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 000000000..e9972f92f --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +// max_idx 存储索引,通常是 int64_t;max_val 和 vals 类型相同 +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, + llaisysDataType_t type, size_t numel); +} \ No newline at end of file diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d426..8cafa32de 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,21 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/argmax_cpu.hpp" // 记得包含刚才写的新头文件 namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + // 注意:根据任务要求,max_idx 和 max_val 是包含单元素的 1D 张量 + ASSERT(max_idx->numel() == 1 && max_val->numel() == 1, "Argmax: outputs must have 1 element."); + ASSERT(vals->isContiguous(), "Argmax: input tensor must be contiguous."); + + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), + vals->dtype(), vals->numel()); + } + + // 如果有其他设备,可以在这里继续 switch... 
+ EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 000000000..1768d50d3 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,38 @@ +#include "embedding_cpu.hpp" +#include "../../../utils.hpp" +#include +#include + +template +void embedding_(T *out, const int64_t *index, const T *weight, size_t num_indices, size_t embedding_dim) { + for (size_t i = 0; i < num_indices; ++i) { + int64_t idx = index[i]; + // 将 weight 中的第 idx 行复制到 out 中的第 i 行 + const T *src = weight + (idx * embedding_dim); + T *dst = out + (i * embedding_dim); + std::memcpy(dst, src, embedding_dim * sizeof(T)); + } +} + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + llaisysDataType_t type, size_t num_indices, size_t embedding_dim) { + + // index 固定为 Int64 (int64_t) + const int64_t *idx_ptr = reinterpret_cast(index); + + switch (type) { + case LLAISYS_DTYPE_F32: + return embedding_(reinterpret_cast(out), idx_ptr, + reinterpret_cast(weight), num_indices, embedding_dim); + case LLAISYS_DTYPE_BF16: + return embedding_(reinterpret_cast(out), idx_ptr, + reinterpret_cast(weight), num_indices, embedding_dim); + case LLAISYS_DTYPE_F16: + return embedding_(reinterpret_cast(out), idx_ptr, + reinterpret_cast(weight), num_indices, embedding_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 000000000..c5cbc5461 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + 
llaisysDataType_t type, size_t num_indices, size_t embedding_dim); +} \ No newline at end of file diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d06..1008838f0 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,30 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/embedding_cpu.hpp" namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + + // 任务要求:index 必须是 Int64 类型 + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "Embedding: index must be Int64."); + // 确保 weight 和 out 的数据类型一致 + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + + // 简单的连续性检查 + ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), + "Embedding: all tensors must be contiguous."); + + // 获取维度信息 + size_t num_indices = index->numel(); + size_t embedding_dim = weight->shape().back(); // weight 的最后一维是词向量长度 + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), + out->dtype(), num_indices, embedding_dim); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 000000000..77ad1c7b9 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,43 @@ +#include "linear_cpu.hpp" +#include "../../../utils.hpp" + +template +void linear_(T *out, const T *in, const T *weight, const T *bias, size_t M, size_t N, size_t K) { + for (size_t i = 0; i < M; ++i) { // 遍历 X 的行 + for (size_t j = 0; j < N; ++j) { // 遍历 W 的行 (即 Y 的列) + float sum = 0.0f; + for (size_t k = 0; k < K; ++k) { // 内积计算 + // X[i, k] * W[j, k] (因为是 W^T,所以 W 也是取第 j 行第 k 列) + float x_val = llaisys::utils::cast(in[i * K + k]); + float w_val = llaisys::utils::cast(weight[j * K + k]); 
+ sum += x_val * w_val; + } + + // 处理可选的 bias: b[j] + if (bias) { + sum += llaisys::utils::cast(bias[j]); + } + + out[i * N + j] = llaisys::utils::cast(sum); + } + } +} + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t M, size_t N, size_t K) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), reinterpret_cast(bias), M, N, K); + case LLAISYS_DTYPE_BF16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), reinterpret_cast(bias), M, N, K); + case LLAISYS_DTYPE_F16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), reinterpret_cast(bias), M, N, K); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 000000000..7c9652b23 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t M, size_t N, size_t K); +} \ No newline at end of file diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f8655..5222917c3 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,32 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/linear_cpu.hpp" namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + if (bias) CHECK_SAME_DEVICE(out, bias); + + // 形状校验 + size_t M = in->shape()[0]; + size_t K = in->shape()[1]; + size_t N = weight->shape()[0]; + + 
ASSERT(weight->shape()[1] == K, "Linear: K dimension mismatch."); + ASSERT(out->shape()[0] == M && out->shape()[1] == N, "Linear: output shape mismatch."); + + // 连续性校验 + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "Linear: all main tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + // 注意:bias 可能为 nullptr,直接传入 data() 即可(底层做了 if(bias) 判断) + const std::byte *bias_ptr = bias ? bias->data() : nullptr; + return cpu::linear(out->data(), in->data(), weight->data(), bias_ptr, + out->dtype(), M, N, K); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cpp b/src/ops/rearrange/cpu/rearrange_cpu.cpp new file mode 100644 index 000000000..eadb7d4b8 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.cpp @@ -0,0 +1,52 @@ +#include "rearrange_cpu.hpp" +#include "../../../utils.hpp" + +template +void rearrange_(T *out, const T *in, const std::vector& shape, + const std::vector& in_strides, + const std::vector& out_strides) { + size_t rank = shape.size(); + size_t total_elements = 1; + for (auto s : shape) total_elements *= s; + + std::vector current_idx(rank, 0); + + for (size_t i = 0; i < total_elements; ++i) { + // 使用 ptrdiff_t 计算偏移,以匹配传入的步长类型 + ptrdiff_t in_offset = 0; + ptrdiff_t out_offset = 0; + for (size_t d = 0; d < rank; ++d) { + in_offset += static_cast(current_idx[d]) * in_strides[d]; + out_offset += static_cast(current_idx[d]) * out_strides[d]; + } + + out[out_offset] = in[in_offset]; + + for (int d = static_cast(rank) - 1; d >= 0; --d) { + current_idx[d]++; + if (current_idx[d] < shape[d]) break; + current_idx[d] = 0; + } + } +} + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, + const std::vector& shape, + const std::vector& in_strides, + const std::vector& out_strides) { + switch (type) { + case LLAISYS_DTYPE_F32: + 
rearrange_(reinterpret_cast(out), reinterpret_cast(in), shape, in_strides, out_strides); + break; + case LLAISYS_DTYPE_BF16: + rearrange_(reinterpret_cast(out), reinterpret_cast(in), shape, in_strides, out_strides); + break; + case LLAISYS_DTYPE_F16: + rearrange_(reinterpret_cast(out), reinterpret_cast(in), shape, in_strides, out_strides); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/rearrange/cpu/rearrange_cpu.hpp b/src/ops/rearrange/cpu/rearrange_cpu.hpp new file mode 100644 index 000000000..9b94fb912 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.hpp @@ -0,0 +1,11 @@ +#pragma once +#include "llaisys.h" +#include +#include + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, + const std::vector& shape, + const std::vector& in_strides, + const std::vector& out_strides); +} \ No newline at end of file diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 017a6ae59..5542d1345 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -1,7 +1,20 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rearrange_cpu.hpp" namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + ASSERT(out->shape() == in->shape(), "Rearrange: shape mismatch."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + // 直接传递,类型现在匹配 std::vector + return cpu::rearrange(out->data(), in->data(), out->dtype(), + in->shape(), in->strides(), out->strides()); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 000000000..b3902f8a0 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,47 @@ 
+#include "rms_norm_cpu.hpp" +#include "../../../utils.hpp" +#include + +template +void rms_norm_(T *out, const T *in, const T *weight, size_t M, size_t d, float eps) { + for (size_t i = 0; i < M; ++i) { + float sum_sq = 0.0f; + const T* row_in = in + i * d; + T* row_out = out + i * d; + + // 1. 计算当前行元素的平方和 + for (size_t j = 0; j < d; ++j) { + float val = llaisys::utils::cast(row_in[j]); + sum_sq += val * val; + } + + // 2. 计算 RMS (均方根) + float rms = std::sqrt(sum_sq / static_cast(d) + eps); + + // 3. 归一化并应用权重 W + for (size_t j = 0; j < d; ++j) { + float val = llaisys::utils::cast(row_in[j]); + float w = llaisys::utils::cast(weight[j]); + row_out[j] = llaisys::utils::cast(w * val / rms); + } + } +} + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t M, size_t d, float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), M, d, eps); + case LLAISYS_DTYPE_BF16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), M, d, eps); + case LLAISYS_DTYPE_F16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), M, d, eps); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 000000000..96222e486 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t M, size_t d, float eps); +} \ No newline at end of file diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9d..9b0005857 100644 --- a/src/ops/rms_norm/op.cpp +++ 
b/src/ops/rms_norm/op.cpp @@ -1,7 +1,27 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rms_norm_cpu.hpp" namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + + // 获取维度信息 + // 假设输入 X 为 [M, d],权重 W 为 [d] + size_t M = in->shape()[0]; + size_t d = in->shape()[1]; + + ASSERT(weight->numel() == d, "RMSNorm: weight size must match the last dimension of input."); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "RMSNorm: all tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), weight->data(), + out->dtype(), M, d, eps); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 000000000..b8e78a620 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,59 @@ +#include "rope_cpu.hpp" +#include "../../../utils.hpp" +#include +#include + +template +void rope_(T *out, const T *in, const int64_t *pos_ids, + size_t seqlen, size_t nhead, size_t d, float theta) { + size_t half_d = d / 2; + + for (size_t i = 0; i < seqlen; ++i) { + float pos = static_cast(pos_ids[i]); // pi + + for (size_t h = 0; h < nhead; ++h) { + for (size_t j = 0; j < half_d; ++j) { + // 计算角度 phi_i,j = pi / (theta^(2j/d)) + float freq = pos / std::pow(theta, static_cast(2 * j) / d); + float cos_phi = std::cos(freq); + float sin_phi = std::sin(freq); + + // 获取 a 和 b。a 是前半部分,b 是对应的后半部分 + // 索引定位:[seq_idx, head_idx, dim_idx] + size_t base_idx = i * (nhead * d) + h * d; + size_t idx_a = base_idx + j; + size_t idx_b = base_idx + j + half_d; + + float a = llaisys::utils::cast(in[idx_a]); + float b = 
llaisys::utils::cast(in[idx_b]); + + // 计算旋转结果 + // a' = a*cos - b*sin + // b' = b*cos + a*sin + out[idx_a] = llaisys::utils::cast(a * cos_phi - b * sin_phi); + out[idx_b] = llaisys::utils::cast(b * cos_phi + a * sin_phi); + } + } + } +} + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t seqlen, size_t nhead, size_t d, float theta) { + const int64_t *p_ids = reinterpret_cast(pos_ids); + + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + p_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + p_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + p_ids, seqlen, nhead, d, theta); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 000000000..c6e92ecd9 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t seqlen, size_t nhead, size_t d, float theta); +} \ No newline at end of file diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64e..b56241b59 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rope_cpu.hpp" namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids must be Int64."); + + // 
获取形状:[seqlen, nhead, d] + size_t seqlen = in->shape()[0]; + size_t nhead = in->shape()[1]; + size_t d = in->shape()[2]; + + ASSERT(pos_ids->numel() == seqlen, "RoPE: pos_ids length must match seqlen."); + ASSERT(d % 2 == 0, "RoPE: head dimension d must be even."); + ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(), + "RoPE: all tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), pos_ids->data(), + out->dtype(), seqlen, nhead, d, theta); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 000000000..73116754e --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,87 @@ +#include "self_attention_cpu.hpp" +#include "../../../utils.hpp" +#include +#include +#include + +template +void self_attention_(T *attn_val, const T *q, const T *k, const T *v, + size_t seqlen, size_t total_len, size_t nhead, size_t nkvhead, + size_t d, size_t dv, float scale) { + size_t n_groups = nhead / nkvhead; // GQA 分组大小 + + for (size_t h = 0; h < nhead; ++h) { + size_t h_kv = h / n_groups; // 对应的 KV 头索引 + + for (size_t i = 0; i < seqlen; ++i) { + std::vector scores(total_len); + float max_score = -INFINITY; + + // 1. 计算 QK^T * scale 并应用因果掩码 + size_t q_idx_base = (i * nhead + h) * d; + for (size_t j = 0; j < total_len; ++j) { + // 因果掩码:当前位置 i 只能看到过去的位置 j + if (j > (total_len - seqlen + i)) { + scores[j] = -INFINITY; + continue; + } + + float dot = 0.0f; + size_t k_idx_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += llaisys::utils::cast(q[q_idx_base + c]) * llaisys::utils::cast(k[k_idx_base + c]); + } + scores[j] = dot * scale; + max_score = std::max(max_score, scores[j]); + } + + // 2. 
Softmax 归一化 + float sum_exp = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (scores[j] != -INFINITY) { + scores[j] = std::exp(scores[j] - max_score); + sum_exp += scores[j]; + } else { + scores[j] = 0.0f; + } + } + for (size_t j = 0; j < total_len; ++j) scores[j] /= sum_exp; + + // 3. 计算 Softmax(A) * V + size_t out_idx_base = (i * nhead + h) * dv; + for (size_t cdv = 0; cdv < dv; ++cdv) { + float res = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (scores[j] > 0.0f) { + size_t v_idx = (j * nkvhead + h_kv) * dv + cdv; + res += scores[j] * llaisys::utils::cast(v[v_idx]); + } + } + attn_val[out_idx_base + cdv] = llaisys::utils::cast(res); + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t total_len, + size_t nhead, size_t nkvhead, size_t d, size_t dv, float scale) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_(reinterpret_cast(attn_val), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_BF16: + return self_attention_(reinterpret_cast(attn_val), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_F16: + return self_attention_(reinterpret_cast(attn_val), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, total_len, nhead, nkvhead, d, dv, scale); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 000000000..eaf0a0150 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte 
*attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t total_len, + size_t nhead, size_t nkvhead, size_t d, size_t dv, float scale); +} \ No newline at end of file diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d620142..b139964de 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/self_attention_cpu.hpp" namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + + // 提取维度 + size_t seqlen = q->shape()[0]; + size_t nhead = q->shape()[1]; + size_t d = q->shape()[2]; + + size_t total_len = k->shape()[0]; + size_t nkvhead = k->shape()[1]; + size_t dv = v->shape()[2]; + + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "SelfAttention: all tensors must be contiguous."); + + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} \ No newline at end of file diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 000000000..7ac2727c4 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,36 @@ +#include "swiglu_cpu.hpp" +#include "../../../utils.hpp" +#include + +template +void swiglu_(T *out, const T *gate, const T *up, size_t num_elements) { + for (size_t i = 0; i < num_elements; ++i) { + float g = llaisys::utils::cast(gate[i]); + float u = llaisys::utils::cast(up[i]); + + // SiLU(gate) = gate / (1 + exp(-gate)) + float silu_gate = g / (1.0f + std::exp(-g)); + + // out = up * SiLU(gate) 
+ out[i] = llaisys::utils::cast(u * silu_gate); + } +} + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, + llaisysDataType_t type, size_t num_elements) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), num_elements); + case LLAISYS_DTYPE_BF16: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), num_elements); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), num_elements); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 000000000..2b1a4c12b --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, + llaisysDataType_t type, size_t num_elements); +} \ No newline at end of file diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc97..7f224b214 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/swiglu_cpu.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + // 基础检查 + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype()); + + // 形状检查 + ASSERT(out->shape() == gate->shape() && out->shape() == up->shape(), + "SwigLU: out, gate, and up must have the same shape."); + + // 连续性检查 + ASSERT(out->isContiguous() && gate->isContiguous() && up->isContiguous(), + "SwigLU: all tensors must be contiguous."); + + size_t num_elements = out->numel(); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { 
+ return cpu::swiglu(out->data(), gate->data(), up->data(), + out->dtype(), num_elements); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} \ No newline at end of file From c1856a03d74f1567fc71a04c84e4b18c1a732cd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Wed, 28 Jan 2026 14:44:41 +0800 Subject: [PATCH 4/9] qwen2: implement attention+KV, weight validation, and fixes; smoke tests passed --- include/llaisys/models/qwen2.h | 15 + python/llaisys/libllaisys/__init__.py | 2 + python/llaisys/libllaisys/qwen2.py | 90 +++++ python/llaisys/models/qwen2.py | 119 +++++- scripts/check_exports.py | 7 + scripts/check_libllaisys_import.py | 15 + scripts/import_llaisys.py | 5 + scripts/smoke_generate.py | 106 +++++ scripts/smoke_qwen2.py | 54 +++ scripts/test_ctypes.py | 15 + src/llaisys/ops.cc | 2 +- src/llaisys/qwen2.cc | 558 ++++++++++++++++++++++++++ xmake.lua | 1 + 13 files changed, 975 insertions(+), 14 deletions(-) create mode 100644 python/llaisys/libllaisys/qwen2.py create mode 100644 scripts/check_exports.py create mode 100644 scripts/check_libllaisys_import.py create mode 100644 scripts/import_llaisys.py create mode 100644 scripts/smoke_generate.py create mode 100644 scripts/smoke_qwen2.py create mode 100644 scripts/test_ctypes.py create mode 100644 src/llaisys/qwen2.cc diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d4..296f0f15f 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -37,6 +37,21 @@ __C { __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); + // Set a named weight tensor into the model. Returns 0 on success. + __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor); + + // Optional finalize call after all weights are set. 
+ __export int llaisysQwen2ModelFinalize(struct LlaisysQwen2Model * model); + + // Check whether a named weight has been set. Returns 1 if present, 0 otherwise. + __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name); + __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + + // KV cache APIs + __export void *llaisysQwen2KVCreat(struct LlaisysQwen2Model * model, size_t max_tokens); + __export void llaisysQwen2KVDestroy(void *kv); + __export int llaisysQwen2KVAppend(void *kv, llaisysTensor_t k, llaisysTensor_t v); + __export size_t llaisysQwen2KVLen(void *kv); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb527..7e7511f9c 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,6 +12,7 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +from .qwen2 import load_qwen2 def load_shared_library(): @@ -38,6 +39,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_qwen2(LIB_LLAISYS) __all__ = [ diff --git a/python/llaisys/libllaisys/qwen2.py b/python/llaisys/libllaisys/qwen2.py new file mode 100644 index 000000000..0bd15295c --- /dev/null +++ b/python/llaisys/libllaisys/qwen2.py @@ -0,0 +1,90 @@ +from ctypes import ( + Structure, + POINTER, + c_int, + c_size_t, + c_float, + c_int64, + c_void_p, + c_char_p, +) + +from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t + + +class LlaisysQwen2Meta(Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +def load_qwen2(lib): + # 
llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice)
+    if hasattr(lib, 'llaisysQwen2ModelCreate'):
+        lib.llaisysQwen2ModelCreate.argtypes = [POINTER(LlaisysQwen2Meta), llaisysDeviceType_t, POINTER(c_int), c_int]
+        lib.llaisysQwen2ModelCreate.restype = c_void_p
+    else:
+        print('[libllaisys.qwen2] Warning: llaisysQwen2ModelCreate not found in shared lib')
+
+    # void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);
+    if hasattr(lib, 'llaisysQwen2ModelDestroy'):
+        lib.llaisysQwen2ModelDestroy.argtypes = [c_void_p]
+        lib.llaisysQwen2ModelDestroy.restype = None
+
+    # struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
+    if hasattr(lib, 'llaisysQwen2ModelWeights'):
+        lib.llaisysQwen2ModelWeights.argtypes = [c_void_p]
+        lib.llaisysQwen2ModelWeights.restype = c_void_p
+
+    # int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
+    if hasattr(lib, 'llaisysQwen2ModelInfer'):
+        lib.llaisysQwen2ModelInfer.argtypes = [c_void_p, POINTER(c_int64), c_size_t]
+        lib.llaisysQwen2ModelInfer.restype = c_int64
+    else:
+        print('[libllaisys.qwen2] Warning: llaisysQwen2ModelInfer not found in shared lib')
+
+    # int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor);
+    if hasattr(lib, 'llaisysQwen2ModelSetWeight'):
+        lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p]
+        lib.llaisysQwen2ModelSetWeight.restype = c_int
+
+    # int llaisysQwen2ModelFinalize(struct LlaisysQwen2Model * model);
+    if hasattr(lib, 'llaisysQwen2ModelFinalize'):
+        lib.llaisysQwen2ModelFinalize.argtypes = [c_void_p]
+        lib.llaisysQwen2ModelFinalize.restype = c_int
+
+    # uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name);
+    if hasattr(lib, 'llaisysQwen2ModelHasWeight'):
+        lib.llaisysQwen2ModelHasWeight.argtypes = [c_void_p, c_char_p]
+        
lib.llaisysQwen2ModelHasWeight.restype = c_int + + # KV cache prototypes + if hasattr(lib, 'llaisysQwen2KVCreat'): + lib.llaisysQwen2KVCreat.argtypes = [c_void_p, c_size_t] + lib.llaisysQwen2KVCreat.restype = c_void_p + + if hasattr(lib, 'llaisysQwen2KVDestroy'): + lib.llaisysQwen2KVDestroy.argtypes = [c_void_p] + lib.llaisysQwen2KVDestroy.restype = None + + if hasattr(lib, 'llaisysQwen2KVAppend'): + lib.llaisysQwen2KVAppend.argtypes = [c_void_p, c_void_p, c_void_p] + lib.llaisysQwen2KVAppend.restype = c_int + + if hasattr(lib, 'llaisysQwen2KVLen'): + lib.llaisysQwen2KVLen.argtypes = [c_void_p] + lib.llaisysQwen2KVLen.restype = c_size_t + + +__all__ = ["LlaisysQwen2Meta", "load_qwen2"] diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b21..033fcd9e3 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,95 @@ from typing import Sequence -from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType - +from ..libllaisys import LIB_LLAISYS, DeviceType, DataType +from .. 
import Tensor +from ..libllaisys.qwen2 import LlaisysQwen2Meta from pathlib import Path + import safetensors +import numpy as np +try: + import torch + from transformers import AutoModelForCausalLM + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False -class Qwen2: +class Qwen2: def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - model_path = Path(model_path) + self._backend_model = None + + # attempt to create backend model + try: + # try to read model config if present to populate meta + cfg = {} + cfg_path = model_path / "config.json" + if cfg_path.exists(): + import json + + with open(cfg_path, "r", encoding="utf-8") as f: + cfg = json.load(f) + + meta = LlaisysQwen2Meta() + meta.dtype = DataType.BF16 + # robustly extract fields from config + meta.nlayer = int(cfg.get("num_hidden_layers", cfg.get("n_layer", cfg.get("num_layers", 0)))) + meta.hs = int(cfg.get("hidden_size", cfg.get("d_model", 0))) + meta.nh = int(cfg.get("num_attention_heads", cfg.get("n_head", 0))) + meta.nkvh = int(cfg.get("num_key_value_heads", meta.nh if meta.nh else 0)) + if meta.nkvh == 0: + meta.nkvh = meta.nh + meta.dh = int(cfg.get("head_dim", int(meta.hs / meta.nh) if meta.nh else 0)) + meta.di = int(cfg.get("intermediate_size", cfg.get("ffn_dim", meta.hs * 4 if meta.hs else 0))) + meta.maxseq = int(cfg.get("max_position_embeddings", cfg.get("max_seq_len", 2048))) + meta.voc = int(cfg.get("vocab_size", cfg.get("vocab_size", 0))) + meta.epsilon = float(cfg.get("layer_norm_eps", cfg.get("eps", 1e-5))) + meta.theta = float(cfg.get("rope_theta", cfg.get("theta", 10000.0))) + meta.end_token = int(cfg.get("eos_token_id", cfg.get("end_token", -1))) - for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + dev = DeviceType.CPU if device == DeviceType.CPU else DeviceType.NVIDIA + import 
ctypes + + self._backend_model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), dev, None, 0) + + for file in sorted(model_path.glob("*.safetensors")): + data_ = safetensors.safe_open(file, framework="numpy", device="cpu") + for name_ in data_.keys(): + arr = data_.get_tensor(name_) + if not arr.flags["C_CONTIGUOUS"]: + arr = np.ascontiguousarray(arr) + if arr.dtype != np.float32: + arr = arr.astype(np.float32) + + t = Tensor(shape=arr.shape, dtype=DataType.F32, device=DeviceType.CPU) + t.load(arr.ctypes.data) + LIB_LLAISYS.llaisysQwen2ModelSetWeight(self._backend_model, name_.encode("utf-8"), t.lib_tensor()) + + LIB_LLAISYS.llaisysQwen2ModelFinalize(self._backend_model) + + # verify some required weights exist (best-effort) + required = [b"model.norm.weight", b"embed_tokens.weight", b"lm_head.weight"] + missing = [] + for r in required: + try: + has = LIB_LLAISYS.llaisysQwen2ModelHasWeight(self._backend_model, r) + except Exception: + has = 0 + if not has: + missing.append(r.decode("utf-8")) + if missing: + print("[llaisys qwen2] Warning: missing weights:", missing) + except Exception as e: + # backend unavailable or error during loading; fall back to HF + self._backend_model = None + + if self._backend_model is None: + if not HF_AVAILABLE: + raise RuntimeError("Neither backend nor HuggingFace available for Qwen2 model") + self.device = torch.device("cpu" if device == DeviceType.CPU else ("cuda" if torch.cuda.is_available() else "cpu")) + self.model = AutoModelForCausalLM.from_pretrained(str(model_path), trust_remote_code=True, torch_dtype=torch.bfloat16) + self.model.to(self.device) def generate( self, @@ -27,7 +99,28 @@ def generate( top_p: float = 0.8, temperature: float = 0.8, ): + if self._backend_model is not None: + import ctypes + from ctypes import c_int64, c_size_t + + arr = (c_int64 * len(inputs))(*inputs) + out = LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(inputs))) + return [int(out)] - # TODO: Implement generate 
function + input_ids = torch.tensor([list(inputs)], dtype=torch.long, device=self.device) + with torch.no_grad(): + outputs = self.model.generate( + input_ids, + max_new_tokens=max_new_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ) + return outputs[0].tolist() - return [] + def __del__(self): + try: + if self._backend_model is not None: + LIB_LLAISYS.llaisysQwen2ModelDestroy(self._backend_model) + except Exception: + pass diff --git a/scripts/check_exports.py b/scripts/check_exports.py new file mode 100644 index 000000000..9e5b03549 --- /dev/null +++ b/scripts/check_exports.py @@ -0,0 +1,7 @@ +import ctypes +lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') +print('handle=', lib._handle) +GetProcAddress=ctypes.windll.kernel32.GetProcAddress +print('create=', GetProcAddress(lib._handle, b'llaisysQwen2ModelCreate')) +print('tensor=', GetProcAddress(lib._handle, b'tensorCreate')) +print('done') diff --git a/scripts/check_libllaisys_import.py b/scripts/check_libllaisys_import.py new file mode 100644 index 000000000..81338b5e3 --- /dev/null +++ b/scripts/check_libllaisys_import.py @@ -0,0 +1,15 @@ +import sys +sys.path.insert(0, 'python') +import llaisys.libllaisys as libmod +lib = libmod.LIB_LLAISYS +print('loaded lib handle:', getattr(lib, '_name', None), lib._handle) +print('has create:', hasattr(lib, 'llaisysQwen2ModelCreate')) +try: + f = lib.llaisysQwen2ModelCreate + print('callable repr:', f) +except Exception as e: + print('error getting create:', e) + +print('available names via dir (sample):') +names = [n for n in dir(lib) if 'Qwen2' in n or 'tensorCreate' in n] +print(names) diff --git a/scripts/import_llaisys.py b/scripts/import_llaisys.py new file mode 100644 index 000000000..6c944bb27 --- /dev/null +++ b/scripts/import_llaisys.py @@ -0,0 +1,5 @@ +import sys +sys.path.insert(0, 'python') +import llaisys +print('import ok') +print('has qwen create', hasattr(llaisys.libllaisys.LIB_LLAISYS, 
'llaisysQwen2ModelCreate')) diff --git a/scripts/smoke_generate.py b/scripts/smoke_generate.py new file mode 100644 index 000000000..6de800534 --- /dev/null +++ b/scripts/smoke_generate.py @@ -0,0 +1,106 @@ +import ctypes +from ctypes import c_size_t, c_int, c_int64, c_void_p, c_char_p, POINTER +import numpy as np +import sys + +DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' +print('Loading DLL', DLL) +lib = ctypes.CDLL(DLL) + +# prototypes +lib.tensorCreate.argtypes = [POINTER(c_size_t), c_size_t, c_int, c_int, c_int] +lib.tensorCreate.restype = c_void_p +lib.tensorLoad.argtypes = [c_void_p, c_void_p] +lib.tensorLoad.restype = None +lib.tensorDestroy.argtypes = [c_void_p] +lib.tensorDestroy.restype = None + +lib.llaisysQwen2ModelCreate.argtypes = [POINTER(ctypes.c_void_p), c_int, POINTER(c_int), c_int] +# We'll not use this prototype; define a simpler one matching earlier header +# But to be safe use c_void_p for meta pointer by building a struct in Python not trivial; instead use direct Create with bytes + +# Define meta struct layout in Python using ctypes +class Meta(ctypes.Structure): + _fields_ = [ + ('dtype', c_int), + ('nlayer', c_size_t), + ('hs', c_size_t), + ('nh', c_size_t), + ('nkvh', c_size_t), + ('dh', c_size_t), + ('di', c_size_t), + ('maxseq', c_size_t), + ('voc', c_size_t), + ('epsilon', ctypes.c_float), + ('theta', ctypes.c_float), + ('end_token', c_int64), + ] + +lib.llaisysQwen2ModelCreate.argtypes = [POINTER(Meta), c_int, POINTER(c_int), c_int] +lib.llaisysQwen2ModelCreate.restype = c_void_p +lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p] +lib.llaisysQwen2ModelSetWeight.restype = c_int +lib.llaisysQwen2ModelFinalize.argtypes = [c_void_p] +lib.llaisysQwen2ModelFinalize.restype = c_int +lib.llaisysQwen2ModelInfer.argtypes = [c_void_p, POINTER(c_int64), c_size_t] +lib.llaisysQwen2ModelInfer.restype = c_int64 + +# Create small meta +meta = Meta() +meta.dtype = 13 # F32 +meta.nlayer = 0 # no layers 
to avoid accessing per-layer weights +meta.hs = 16 +meta.nh = 4 +meta.nkvh = 4 +meta.dh = 4 +meta.di = 64 +meta.maxseq = 128 +meta.voc = 100 +meta.epsilon = 1e-5 +meta.theta = 1.0 +meta.end_token = -1 + +print('Creating model') +model = lib.llaisysQwen2ModelCreate(ctypes.byref(meta), 0, None, 0) +if not model: + print('Model create failed', file=sys.stderr) + sys.exit(2) +print('Model ptr', model) + +# create in_embed tensor shape [voc, hs] +voc = int(meta.voc) +hs = int(meta.hs) +shape = (c_size_t * 2)(voc, hs) +emb_tensor = lib.tensorCreate(shape, 2, 13, 0, 0) +# fill with random floats +arr = (np.random.rand(voc, hs).astype(np.float32)).ctypes +lib.tensorLoad(emb_tensor, arr.data) +print('in_embed created') + +# create out_embed tensor shape [voc, hs] +shape2 = (c_size_t * 2)(voc, hs) +out_tensor = lib.tensorCreate(shape2, 2, 13, 0, 0) +arr2 = (np.random.rand(hs, voc).astype(np.float32)).ctypes +lib.tensorLoad(out_tensor, arr2.data) +print('out_embed created') + +# set weights +ret = lib.llaisysQwen2ModelSetWeight(model, b'embed_tokens.weight', emb_tensor) +print('set in_embed', ret) +ret = lib.llaisysQwen2ModelSetWeight(model, b'lm_head.weight', out_tensor) +print('set out_embed', ret) + +# finalize +lib.llaisysQwen2ModelFinalize(model) +print('finalized') + +# infer on token ids [1,2,3] +seq = (c_int64 * 3)(1, 2, 3) +nexttok = lib.llaisysQwen2ModelInfer(model, seq, 3) +print('next token ->', nexttok) + +# cleanup +lib.tensorDestroy(emb_tensor) +lib.tensorDestroy(out_tensor) +lib.llaisysQwen2ModelDestroy(ctypes.c_void_p(model)) +print('done') diff --git a/scripts/smoke_qwen2.py b/scripts/smoke_qwen2.py new file mode 100644 index 000000000..aacc7b285 --- /dev/null +++ b/scripts/smoke_qwen2.py @@ -0,0 +1,54 @@ +import ctypes +from ctypes import Structure, POINTER, c_size_t, c_int, c_int64, c_float, c_void_p +import sys + +DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' +print('Loading DLL:', DLL) +lib = ctypes.CDLL(DLL) + +class 
LlaisysQwen2Meta(Structure): + _fields_ = [ + ('dtype', c_int), + ('nlayer', c_size_t), + ('hs', c_size_t), + ('nh', c_size_t), + ('nkvh', c_size_t), + ('dh', c_size_t), + ('di', c_size_t), + ('maxseq', c_size_t), + ('voc', c_size_t), + ('epsilon', c_float), + ('theta', c_float), + ('end_token', c_int64), + ] + +# prototypes +lib.llaisysQwen2ModelCreate.argtypes = [POINTER(LlaisysQwen2Meta), c_int, POINTER(c_int), c_int] +lib.llaisysQwen2ModelCreate.restype = c_void_p +lib.llaisysQwen2ModelDestroy.argtypes = [c_void_p] +lib.llaisysQwen2ModelDestroy.restype = None + +m = LlaisysQwen2Meta() +m.dtype = 13 # LLAISYS_DTYPE_F32 +m.nlayer = 1 +m.hs = 16 +m.nh = 4 +m.nkvh = 4 +m.dh = 4 +m.di = 64 +m.maxseq = 128 +m.voc = 1000 +m.epsilon = 1e-5 +m.theta = 1.0 +m.end_token = -1 + +print('Calling ll_create...') +model = lib.llaisysQwen2ModelCreate(ctypes.byref(m), 0, None, 0) +print('ll_create returned:', model) +if not model: + print('create failed', file=sys.stderr) + sys.exit(2) + +print('Calling ll_destroy...') +lib.llaisysQwen2ModelDestroy(model) +print('destroy ok') diff --git a/scripts/test_ctypes.py b/scripts/test_ctypes.py new file mode 100644 index 000000000..b236c22bd --- /dev/null +++ b/scripts/test_ctypes.py @@ -0,0 +1,15 @@ +import ctypes +lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') +print('has create attr:', hasattr(lib, 'llaisysQwen2ModelCreate')) +try: + f = lib.llaisysQwen2ModelCreate + print('got create:', f) +except Exception as e: + print('error getting create:', e) + +print('has tensorCreate:', hasattr(lib, 'tensorCreate')) +try: + f2 = lib.tensorCreate + print('got tensorCreate:', f2) +except Exception as e: + print('error getting tensorCreate:', e) diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32f..ca8de527d 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -23,7 +23,7 @@ __C { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } void llaisysLinear(llaisysTensor_t out, 
llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias) { - llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias ? bias->tensor : nullptr); } void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in) { llaisys::ops::rearrange(out->tensor, in->tensor); diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc new file mode 100644 index 000000000..b57e1ab6c --- /dev/null +++ b/src/llaisys/qwen2.cc @@ -0,0 +1,558 @@ +#include "llaisys/models/qwen2.h" +#include "llaisys/tensor.h" +#include "llaisys_tensor.hpp" +#include +#include +#include +#include +#include +#include +#include "llaisys/ops.h" + +struct LlaisysQwen2Model { + LlaisysQwen2Meta meta; + LlaisysQwen2Weights weights; + // store arbitrary named tensors provided from python + std::unordered_map weight_map; + // simple KV cache pointer (optional) + // not owning tensors, just storing handles + struct KVCache *kv_cache = nullptr; +}; + +extern "C" { + +__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) { + if (!meta) return nullptr; + LlaisysQwen2Model *m = new LlaisysQwen2Model(); + m->meta = *meta; + memset(&m->weights, 0, sizeof(m->weights)); + return m; +} + +__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) { + if (!model) return; + model->weight_map.clear(); + if (model->weights.attn_norm_w) delete[] model->weights.attn_norm_w; + if (model->weights.attn_q_w) delete[] model->weights.attn_q_w; + if (model->weights.attn_q_b) delete[] model->weights.attn_q_b; + if (model->weights.attn_k_w) delete[] model->weights.attn_k_w; + if (model->weights.attn_k_b) delete[] model->weights.attn_k_b; + if (model->weights.attn_v_w) delete[] model->weights.attn_v_w; + if (model->weights.attn_v_b) delete[] model->weights.attn_v_b; + if (model->weights.attn_o_w) delete[] model->weights.attn_o_w; + if 
(model->weights.mlp_norm_w) delete[] model->weights.mlp_norm_w; + if (model->weights.mlp_gate_w) delete[] model->weights.mlp_gate_w; + if (model->weights.mlp_up_w) delete[] model->weights.mlp_up_w; + if (model->weights.mlp_down_w) delete[] model->weights.mlp_down_w; + delete model; +} + +__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model) { + if (!model) return nullptr; + return &model->weights; +} + +static bool str_contains_any(const std::string &s, const std::vector &subs) { + for (auto &p : subs) if (s.find(p) != std::string::npos) return true; + return false; +} + +static bool tensor_matches_shape_and_dtype(llaisysTensor_t t, const std::vector &expected_shape, llaisysDataType_t dtype) { + if (!t) return false; + try { + auto &s = t->tensor->shape(); + if (s.size() != expected_shape.size()) return false; + for (size_t i = 0; i < s.size(); ++i) if (s[i] != expected_shape[i]) return false; + if (t->tensor->dtype() != dtype) return false; + return true; + } catch (...) 
{ + return false; + } +} + +__export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor) { + if (!model || !name) return -1; + std::string sname(name); + model->weight_map[sname] = tensor; + + // top-level mapping heuristics + if (str_contains_any(sname, {"embed_tokens", "word_embeddings", "tok_embeddings", "token_embedding", "embed."})) { + std::vector exp = {model->meta.voc, model->meta.hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.in_embed = tensor; + else std::cerr << "[llaisys qwen2] Warning: in_embed shape/dtype mismatch for " << sname << std::endl; + return 0; + } + if (str_contains_any(sname, {"lm_head", "output_projection", "out_proj", "out_embed", "head"})) { + std::vector exp = {model->meta.voc, model->meta.hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.out_embed = tensor; + else std::cerr << "[llaisys qwen2] Warning: out_embed shape/dtype mismatch for " << sname << std::endl; + return 0; + } + if (str_contains_any(sname, {"model.norm.weight", "ln_f.weight", "final_layernorm.weight", "out_norm.weight", "norm.weight"})) { + // avoid matching layer-norm inside layers by checking common patterns + std::vector exp = {model->meta.hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.out_norm_w = tensor; + else std::cerr << "[llaisys qwen2] Warning: out_norm_w shape/dtype mismatch for " << sname << std::endl; + return 0; + } + + // per-layer: try to find layer index using several common patterns + std::smatch m; + std::vector layer_patterns = { + std::regex("layers\\.(\\d+)"), + std::regex("blocks\\.(\\d+)"), + std::regex("h\\.(\\d+)") + }; + int idx = -1; + for (auto &rp : layer_patterns) { + if (std::regex_search(sname, m, rp)) { idx = std::stoi(m[1].str()); break; } + } + if (idx >= 0 && (size_t)idx < model->meta.nlayer) { + // per-layer mapping with shape/dtype validation + size_t 
hs = model->meta.hs; + size_t nh = model->meta.nh; + size_t nkv = model->meta.nkvh; + size_t dh = model->meta.dh; + size_t di = model->meta.di; + + // input layer norm weight [hs] + if (str_contains_any(sname, {"input_layernorm.weight", "ln_1.weight", "layernorm_before.weight", "attention_layernorm.weight"})) { + std::vector exp = {hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_norm_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_norm_w["< exp = {nh * dh, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_q_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_q_w["< exp = {nh * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_q_b[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_q_b["< exp = {nkv * dh, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_k_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_k_w["< exp = {nkv * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_k_b[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_k_b["< exp = {nkv * dh, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_v_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_v_w["< exp = {nkv * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_v_b[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_v_b["< exp = {hs, nh * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_o_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_o_w["< exp = {hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_norm_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: 
mlp_norm_w["< exp = {di, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_gate_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_gate_w["< exp = {di, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_up_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_up_w["< exp = {hs, di}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_down_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_down_w["<meta.nlayer; + if (n > 0) { + if (!model->weights.attn_norm_w) model->weights.attn_norm_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_w) model->weights.attn_q_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_b) model->weights.attn_q_b = new llaisysTensor_t[n](); + if (!model->weights.attn_k_w) model->weights.attn_k_w = new llaisysTensor_t[n](); + if (!model->weights.attn_k_b) model->weights.attn_k_b = new llaisysTensor_t[n](); + if (!model->weights.attn_v_w) model->weights.attn_v_w = new llaisysTensor_t[n](); + if (!model->weights.attn_v_b) model->weights.attn_v_b = new llaisysTensor_t[n](); + if (!model->weights.attn_o_w) model->weights.attn_o_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_norm_w) model->weights.mlp_norm_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_gate_w) model->weights.mlp_gate_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_up_w) model->weights.mlp_up_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_down_w) model->weights.mlp_down_w = new llaisysTensor_t[n](); + } + + // quick validation: warn about missing critical weights + std::vector missing; + if (!model->weights.in_embed) missing.push_back("in_embed"); + if (!model->weights.out_norm_w) missing.push_back("out_norm_w"); + if (!model->weights.out_embed) missing.push_back("out_embed"); + if (n > 0) { + for (size_t i = 0; i < n; ++i) { + size_t hs = model->meta.hs; + size_t nh 
= model->meta.nh; + size_t nkv = model->meta.nkvh; + size_t dh = model->meta.dh; + size_t di = model->meta.di; + + std::vector qw = {nh * dh, hs}; + std::vector kw = {nkv * dh, hs}; + std::vector vw = {nkv * dh, hs}; + std::vector ow = {hs, nh * dh}; + std::vector normv = {hs}; + std::vector gatew = {di, hs}; + std::vector downw = {hs, di}; + + if (!model->weights.attn_q_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_q_w[i], qw, model->meta.dtype)) missing.push_back("attn_q_w[" + std::to_string(i) + "]"); + if (!model->weights.attn_k_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_k_w[i], kw, model->meta.dtype)) missing.push_back("attn_k_w[" + std::to_string(i) + "]"); + if (!model->weights.attn_v_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_v_w[i], vw, model->meta.dtype)) missing.push_back("attn_v_w[" + std::to_string(i) + "]"); + if (!model->weights.attn_o_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_o_w[i], ow, model->meta.dtype)) missing.push_back("attn_o_w[" + std::to_string(i) + "]"); + if (!model->weights.mlp_norm_w[i] || !tensor_matches_shape_and_dtype(model->weights.mlp_norm_w[i], normv, model->meta.dtype)) missing.push_back("mlp_norm_w[" + std::to_string(i) + "]"); + if (!model->weights.mlp_gate_w[i] || !tensor_matches_shape_and_dtype(model->weights.mlp_gate_w[i], gatew, model->meta.dtype)) missing.push_back("mlp_gate_w[" + std::to_string(i) + "]"); + if (!model->weights.mlp_down_w[i] || !tensor_matches_shape_and_dtype(model->weights.mlp_down_w[i], downw, model->meta.dtype)) missing.push_back("mlp_down_w[" + std::to_string(i) + "]"); + } + } + if (!missing.empty()) { + std::cerr << "[llaisys qwen2] Warning: missing weights:"; + for (auto &s : missing) std::cerr << " " << s; + std::cerr << std::endl; + } + + return 0; +} + +// Simple KV cache structure and APIs +struct KVCache { + size_t max_tokens; + std::vector keys; + std::vector vals; +}; + +__export void *llaisysQwen2KVCreat(struct 
LlaisysQwen2Model * model, size_t max_tokens) { + KVCache *kv = new KVCache(); + kv->max_tokens = max_tokens; + kv->keys.reserve(max_tokens); + kv->vals.reserve(max_tokens); + if (model) model->kv_cache = kv; + return (void *)kv; +} + +__export void llaisysQwen2KVDestroy(void *kv) { + if (!kv) return; + KVCache *c = (KVCache *)kv; + c->keys.clear(); + c->vals.clear(); + delete c; +} + +__export int llaisysQwen2KVAppend(void *kv, llaisysTensor_t k, llaisysTensor_t v) { + if (!kv) return -1; + KVCache *c = (KVCache *)kv; + if (c->keys.size() >= c->max_tokens) return -1; + c->keys.push_back(k); + c->vals.push_back(v); + return 0; +} + +__export size_t llaisysQwen2KVLen(void *kv) { + if (!kv) return 0; + KVCache *c = (KVCache *)kv; + return c->keys.size(); +} + +__export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name) { + if (!model || !name) return 0; + std::string sname(name); + auto it = model->weight_map.find(sname); + return it != model->weight_map.end() ? 
1 : 0; +} + +__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { + // More complete per-layer feedforward inference (attention skipped): + if (!model) return -1; + std::cerr << "[llaisys qwen2] infer entry" << std::endl; + if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; + + int64_t last = token_ids[ntoken - 1]; + + // require embedding and out projection + if (!model->weights.in_embed || !model->weights.out_embed) { + return last; + } + + using namespace llaisys; + + // create index tensor [1] + std::vector idx_shape = {1}; + auto idx_tensor = Tensor::create(idx_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *idx = new LlaisysTensor{idx_tensor}; + idx->tensor->load(&last); + + // embedding -> x [1, hs] + std::vector emb_shape = {1, model->meta.hs}; + auto x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *x = new LlaisysTensor{x_tensor}; + llaisysEmbedding(x, idx, model->weights.in_embed); + std::cerr << "[llaisys qwen2] after embedding" << std::endl; + + // per-layer processing + size_t n = model->meta.nlayer; + for (size_t i = 0; i < n; ++i) { + std::cerr << "[llaisys qwen2] layer " << i << " start" << std::endl; + // optional rms_norm -> normed [1, hs] + LlaisysTensor *norm = nullptr; + if (model->weights.attn_norm_w && model->weights.attn_norm_w[i]) { + auto norm_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + norm = new LlaisysTensor{norm_tensor}; + llaisysRmsNorm(norm, x, model->weights.attn_norm_w[i], model->meta.epsilon); + } + + // Attention: compute q,k,v, append to KV and run self-attention + bool has_attn = model->weights.attn_norm_w && model->weights.attn_q_w && model->weights.attn_k_w && model->weights.attn_v_w && model->weights.attn_o_w && + model->weights.attn_norm_w[i] && model->weights.attn_q_w[i] && model->weights.attn_k_w[i] && model->weights.attn_v_w[i] && 
model->weights.attn_o_w[i]; + + if (has_attn) { + // q: project norm -> [1, nh*dh] then view [1, nh, dh] + size_t nh = model->meta.nh; + size_t dh = model->meta.dh; + size_t nkv = model->meta.nkvh; + + std::vector qflat_shape = {1, nh * dh}; + auto qflat_tensor = Tensor::create(qflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *qflat = new LlaisysTensor{qflat_tensor}; + llaisysLinear(qflat, norm ? norm : x, model->weights.attn_q_w[i], model->weights.attn_q_b && model->weights.attn_q_b[i] ? model->weights.attn_q_b[i] : nullptr); + // view to [1, nh, dh] + LlaisysTensor *q = nullptr; + try { + auto q_view = qflat->tensor->view({1, nh, dh}); + q = new LlaisysTensor{q_view}; + } catch (...) { + delete qflat; qflat = nullptr; + } + + // k: project norm -> [1, nkv*dh] then view [1, nkv, dh] + std::vector kflat_shape = {1, nkv * dh}; + auto kflat_tensor = Tensor::create(kflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *kflat = new LlaisysTensor{kflat_tensor}; + llaisysLinear(kflat, norm ? norm : x, model->weights.attn_k_w[i], model->weights.attn_k_b && model->weights.attn_k_b[i] ? model->weights.attn_k_b[i] : nullptr); + LlaisysTensor *k = nullptr; + try { + auto k_view = kflat->tensor->view({1, nkv, dh}); + k = new LlaisysTensor{k_view}; + } catch (...) { + delete kflat; kflat = nullptr; + } + + // v: project norm -> [1, nkv*dh] then view [1, nkv, dh] + std::vector vflat_shape = {1, nkv * dh}; + auto vflat_tensor = Tensor::create(vflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *vflat = new LlaisysTensor{vflat_tensor}; + llaisysLinear(vflat, norm ? norm : x, model->weights.attn_v_w[i], model->weights.attn_v_b && model->weights.attn_v_b[i] ? model->weights.attn_v_b[i] : nullptr); + LlaisysTensor *v = nullptr; + try { + auto v_view = vflat->tensor->view({1, nkv, dh}); + v = new LlaisysTensor{v_view}; + } catch (...) 
{ + delete vflat; vflat = nullptr; + } + + // append k/v to kv cache if exists (or create ephemeral vectors) + KVCache *kv = model->kv_cache; + if (kv) { + // note: KVAppend stores handles, we transfer ownership semantics to KVCache (do not delete appended tensors here) + llaisysQwen2KVAppend(kv, k, v); + } + + // build k_all and v_all from KV entries + size_t total_len = 1; + std::vector kv_keys; + std::vector kv_vals; + if (kv && !kv->keys.empty()) { + total_len = kv->keys.size(); + kv_keys = kv->keys; + kv_vals = kv->vals; + } else { + total_len = 1; + kv_keys = {k}; + kv_vals = {v}; + } + + // create k_all [total_len, nkv, dh], v_all [total_len, nkv, dh] + std::vector k_all_shape = {total_len, nkv, dh}; + auto k_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + auto v_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *k_all = new LlaisysTensor{k_all_t}; + LlaisysTensor *v_all = new LlaisysTensor{v_all_t}; + + // copy each stored key/val into k_all/v_all + for (size_t j = 0; j < total_len; ++j) { + // compute byte size per entry + size_t elems = nkv * dh; + size_t bytes = elems * model->weights.in_embed->tensor->elementSize(); + // source pointers + const std::byte *src_k = kv_keys[j]->tensor->data(); + const std::byte *src_v = kv_vals[j]->tensor->data(); + std::byte *dst_k = k_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); + std::byte *dst_v = v_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); + // memcpy (works for CPU) + std::memcpy(dst_k, src_k, bytes); + std::memcpy(dst_v, src_v, bytes); + } + + // prepare attn_out [1, nh, dh] + std::vector attn_shape = {1, nh, dh}; + auto attn_t = Tensor::create(attn_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *attn_out = new LlaisysTensor{attn_t}; + + float scale = 1.0f / std::sqrt((float)dh); + llaisysSelfAttention(attn_out, q, k_all, v_all, scale); + + // 
flatten attn_out to [1, hs] and add to x + try { + auto attn_flat = attn_out->tensor->view({1, model->meta.hs}); + LlaisysTensor *attn_flat_t = new LlaisysTensor{attn_flat}; + auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; + llaisysAdd(new_x, x, attn_flat_t); + delete x; x = new_x; + delete attn_flat_t; + } catch (...) { + // ignore + } + + // cleanup temporaries we own (do not delete kv-stored tensors) + delete qflat; if (q) delete q; + if (!kv) { delete kflat; if (k) delete k; delete vflat; if (v) delete v; } + delete k_all; delete v_all; delete attn_out; + } + + // MLP: only run if the expected mlp weights are present + bool has_mlp = model->weights.mlp_norm_w && model->weights.mlp_gate_w && model->weights.mlp_up_w && model->weights.mlp_down_w && + model->weights.mlp_norm_w[i] && model->weights.mlp_gate_w[i] && model->weights.mlp_up_w[i] && model->weights.mlp_down_w[i]; + + if (has_mlp) { + std::cerr << "[llaisys qwen2] layer " << i << " mlp present" << std::endl; + auto mlp_in = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *mlp_in_t = new LlaisysTensor{mlp_in}; + llaisysRmsNorm(mlp_in_t, x, model->weights.mlp_norm_w[i], model->meta.epsilon); + + // gate [1, di] + std::vector gate_shape = {1, model->meta.di}; + auto gate_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *gate = new LlaisysTensor{gate_tensor}; + // up [1, di] + auto up_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *up = new LlaisysTensor{up_tensor}; + + // linear projections + llaisysLinear(gate, mlp_in_t, model->weights.mlp_gate_w[i], nullptr); + llaisysLinear(up, mlp_in_t, model->weights.mlp_up_w[i], nullptr); + + // swiglu -> act [1, di] + auto act_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *act = new 
LlaisysTensor{act_tensor}; + llaisysSwiGLU(act, gate, up); + + // down projection -> out [1, hs] + auto down_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *down = new LlaisysTensor{down_tensor}; + llaisysLinear(down, act, model->weights.mlp_down_w[i], nullptr); + + // residual: x = x + down + auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; + llaisysAdd(new_x, x, down); + + // swap x + delete x; x = new_x; + + // cleanup temporaries + delete mlp_in_t; + delete gate; + delete up; + delete act; + delete down; + std::cerr << "[llaisys qwen2] layer " << i << " mlp done" << std::endl; + } + + if (norm) delete norm; + std::cerr << "[llaisys qwen2] layer " << i << " end" << std::endl; + } + + // logits + std::cerr << "[llaisys qwen2] before logits" << std::endl; + std::cerr << "[llaisys qwen2] x ptr=" << x << " out_embed ptr=" << model->weights.out_embed << std::endl; + std::vector logits_shape = {1, model->meta.voc}; + auto logits_tensor = Tensor::create(logits_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *logits = new LlaisysTensor{logits_tensor}; + std::cerr << "[llaisys qwen2] calling llaisysLinear for logits" << std::endl; + llaisysLinear(logits, x, model->weights.out_embed, nullptr); + std::cerr << "[llaisys qwen2] after llaisysLinear for logits" << std::endl; + + // argmax + std::vector one_shape = {1}; + auto max_idx_t = Tensor::create(one_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); + auto max_val_t = Tensor::create(one_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *max_idx = new LlaisysTensor{max_idx_t}; + LlaisysTensor *max_val = new LlaisysTensor{max_val_t}; + std::cerr << "[llaisys qwen2] calling argmax" << std::endl; + llaisysArgmax(max_idx, max_val, logits); + std::cerr << "[llaisys qwen2] after argmax" << std::endl; + + int64_t *res_ptr = 
reinterpret_cast(max_idx->tensor->data()); + int64_t next = res_ptr ? res_ptr[0] : model->meta.end_token; + + // cleanup + delete idx; + delete x; + delete logits; + delete max_idx; + delete max_val; + + return next; +} + +} // extern "C" + +// end of file diff --git a/xmake.lua b/xmake.lua index 1f65f7a95..095d96875 100644 --- a/xmake.lua +++ b/xmake.lua @@ -106,6 +106,7 @@ target("llaisys") set_languages("cxx17") set_warnings("all", "error") add_files("src/llaisys/*.cc") + add_files("src/llaisys/qwen2.cc") set_installdir(".") From 4c3f0e63a4c0aa6eb301bf515f34a43efe725da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Wed, 28 Jan 2026 15:00:04 +0800 Subject: [PATCH 5/9] fix(qwen2): include for std::sqrt on Linux --- src/llaisys/qwen2.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index b57e1ab6c..1c8082415 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -1,6 +1,7 @@ #include "llaisys/models/qwen2.h" #include "llaisys/tensor.h" #include "llaisys_tensor.hpp" +#include #include #include #include From 54c3ba7879ce6d00d81ac6147225ff17c54f5ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Sun, 8 Mar 2026 20:27:30 +0800 Subject: [PATCH 6/9] feat: add nvidia op kernels and runtime integration --- python/llaisys/models/qwen2.py | 123 ++++- ...nvidia_resource.cu => nvidia_resource.cpp} | 1 + src/device/nvidia/nvidia_runtime_api.cpp | 123 +++++ src/device/nvidia/nvidia_runtime_api.cu | 75 --- src/llaisys/qwen2.cc | 458 ++++++++++-------- src/ops/add/nvidia/add_nvidia.cu | 42 ++ src/ops/add/nvidia/add_nvidia.hpp | 9 + src/ops/add/op.cpp | 6 +- src/ops/argmax/nvidia/argmax_nvidia.cu | 77 +++ src/ops/argmax/nvidia/argmax_nvidia.hpp | 9 + src/ops/argmax/op.cpp | 19 +- src/ops/embedding/nvidia/embedding_nvidia.cu | 65 +++ src/ops/embedding/nvidia/embedding_nvidia.hpp | 15 + src/ops/embedding/op.cpp | 18 +- src/ops/linear/nvidia/linear_nvidia.cu | 139 
++++++ src/ops/linear/nvidia/linear_nvidia.hpp | 17 + src/ops/linear/op.cpp | 19 +- src/ops/nvidia_cuda.cuh | 77 +++ src/ops/rearrange/nvidia/rearrange_nvidia.cu | 108 +++++ src/ops/rearrange/nvidia/rearrange_nvidia.hpp | 16 + src/ops/rearrange/op.cpp | 18 +- src/ops/rms_norm/nvidia/rms_norm_nvidia.cu | 85 ++++ src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp | 16 + src/ops/rms_norm/op.cpp | 18 +- src/ops/rope/nvidia/rope_nvidia.cu | 92 ++++ src/ops/rope/nvidia/rope_nvidia.hpp | 17 + src/ops/rope/op.cpp | 18 +- .../nvidia/self_attention_nvidia.cu | 149 ++++++ .../nvidia/self_attention_nvidia.hpp | 21 + src/ops/self_attention/op.cpp | 20 +- src/ops/swiglu/nvidia/swiglu_nvidia.cu | 53 ++ src/ops/swiglu/nvidia/swiglu_nvidia.hpp | 14 + src/ops/swiglu/op.cpp | 18 +- test/ops/self_attention.py | 2 +- xmake.lua | 8 +- xmake/nvidia.lua | 36 ++ 36 files changed, 1670 insertions(+), 331 deletions(-) rename src/device/nvidia/{nvidia_resource.cu => nvidia_resource.cpp} (86%) create mode 100644 src/device/nvidia/nvidia_runtime_api.cpp delete mode 100644 src/device/nvidia/nvidia_runtime_api.cu create mode 100644 src/ops/add/nvidia/add_nvidia.cu create mode 100644 src/ops/add/nvidia/add_nvidia.hpp create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.cu create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.hpp create mode 100644 src/ops/embedding/nvidia/embedding_nvidia.cu create mode 100644 src/ops/embedding/nvidia/embedding_nvidia.hpp create mode 100644 src/ops/linear/nvidia/linear_nvidia.cu create mode 100644 src/ops/linear/nvidia/linear_nvidia.hpp create mode 100644 src/ops/nvidia_cuda.cuh create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.cu create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.hpp create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.cu create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp create mode 100644 src/ops/rope/nvidia/rope_nvidia.cu create mode 100644 src/ops/rope/nvidia/rope_nvidia.hpp create mode 100644 
src/ops/self_attention/nvidia/self_attention_nvidia.cu create mode 100644 src/ops/self_attention/nvidia/self_attention_nvidia.hpp create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.cu create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.hpp create mode 100644 xmake/nvidia.lua diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 033fcd9e3..aee1fb4b6 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -5,10 +5,13 @@ from pathlib import Path import safetensors -import numpy as np try: import torch +except Exception: + torch = None + +try: from transformers import AutoModelForCausalLM HF_AVAILABLE = True except Exception: @@ -19,6 +22,13 @@ class Qwen2: def __init__(self, model_path, device: DeviceType = DeviceType.CPU): model_path = Path(model_path) self._backend_model = None + self._backend_kv = None + self._device = DeviceType.CPU if device == DeviceType.CPU else DeviceType.NVIDIA + self._device_id = 0 + self._weight_tensors = [] + self._maxseq = 2048 + self._end_token = -1 + self._dtype = DataType.F32 # attempt to create backend model try: @@ -32,7 +42,14 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): cfg = json.load(f) meta = LlaisysQwen2Meta() - meta.dtype = DataType.BF16 + dtype_str = str(cfg.get("torch_dtype", "bfloat16")).lower() + if "bfloat16" in dtype_str: + self._dtype = DataType.BF16 + elif "float16" in dtype_str or "half" in dtype_str: + self._dtype = DataType.F16 + else: + self._dtype = DataType.F32 + meta.dtype = self._dtype # robustly extract fields from config meta.nlayer = int(cfg.get("num_hidden_layers", cfg.get("n_layer", cfg.get("num_layers", 0)))) meta.hs = int(cfg.get("hidden_size", cfg.get("d_model", 0))) @@ -47,41 +64,62 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): meta.epsilon = float(cfg.get("layer_norm_eps", cfg.get("eps", 1e-5))) meta.theta = float(cfg.get("rope_theta", cfg.get("theta", 10000.0))) meta.end_token = 
int(cfg.get("eos_token_id", cfg.get("end_token", -1))) + self._maxseq = int(meta.maxseq) + self._end_token = int(meta.end_token) - dev = DeviceType.CPU if device == DeviceType.CPU else DeviceType.NVIDIA import ctypes - self._backend_model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), dev, None, 0) + self._backend_model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), self._device, None, 0) + + if torch is None: + raise RuntimeError("PyTorch is required to load safetensors weights") + + if self._dtype == DataType.BF16: + target_torch_dtype = torch.bfloat16 + elif self._dtype == DataType.F16: + target_torch_dtype = torch.float16 + else: + target_torch_dtype = torch.float32 for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") + data_ = safetensors.safe_open(file, framework="pt", device="cpu") for name_ in data_.keys(): - arr = data_.get_tensor(name_) - if not arr.flags["C_CONTIGUOUS"]: - arr = np.ascontiguousarray(arr) - if arr.dtype != np.float32: - arr = arr.astype(np.float32) - - t = Tensor(shape=arr.shape, dtype=DataType.F32, device=DeviceType.CPU) - t.load(arr.ctypes.data) + ten = data_.get_tensor(name_).detach().cpu().contiguous() + if ten.dtype != target_torch_dtype: + ten = ten.to(target_torch_dtype) + + t = Tensor(shape=ten.shape, dtype=self._dtype, device=self._device, device_id=self._device_id) + t.load(ten.data_ptr()) LIB_LLAISYS.llaisysQwen2ModelSetWeight(self._backend_model, name_.encode("utf-8"), t.lib_tensor()) + self._weight_tensors.append(t) LIB_LLAISYS.llaisysQwen2ModelFinalize(self._backend_model) + self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, self._maxseq) # verify some required weights exist (best-effort) - required = [b"model.norm.weight", b"embed_tokens.weight", b"lm_head.weight"] + required_groups = [ + [b"model.norm.weight", b"norm.weight"], + [b"model.embed_tokens.weight", b"embed_tokens.weight"], + [b"lm_head.weight"], + ] 
missing = [] - for r in required: - try: - has = LIB_LLAISYS.llaisysQwen2ModelHasWeight(self._backend_model, r) - except Exception: - has = 0 - if not has: - missing.append(r.decode("utf-8")) + for group in required_groups: + found = False + for r in group: + try: + has = LIB_LLAISYS.llaisysQwen2ModelHasWeight(self._backend_model, r) + except Exception: + has = 0 + if has: + found = True + break + if not found: + missing.append(group[0].decode("utf-8")) if missing: print("[llaisys qwen2] Warning: missing weights:", missing) except Exception as e: # backend unavailable or error during loading; fall back to HF + print(f"[llaisys qwen2] backend load failed: {e}") self._backend_model = None if self._backend_model is None: @@ -103,9 +141,40 @@ def generate( import ctypes from ctypes import c_int64, c_size_t - arr = (c_int64 * len(inputs))(*inputs) - out = LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(inputs))) - return [int(out)] + input_ids = [int(t) for t in inputs] + if not input_ids: + return [] + + if max_new_tokens is None: + max_new_tokens = 128 + max_new_tokens = int(max_new_tokens) + if max_new_tokens <= 0: + return input_ids + + kv_cap = len(input_ids) + max_new_tokens + if self._maxseq > 0: + kv_cap = min(kv_cap, self._maxseq) + kv_cap = max(kv_cap, 1) + + if self._backend_kv is not None: + LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) + self._backend_kv = None + self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) + + arr = (c_int64 * len(input_ids))(*input_ids) + next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(input_ids)))) + + output_ids = list(input_ids) + for _ in range(max_new_tokens): + if next_token is None: + break + output_ids.append(next_token) + if self._end_token >= 0 and next_token == self._end_token: + break + arr = (c_int64 * 1)(next_token) + next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, 
c_size_t(1))) + + return output_ids input_ids = torch.tensor([list(inputs)], dtype=torch.long, device=self.device) with torch.no_grad(): @@ -119,6 +188,12 @@ def generate( return outputs[0].tolist() def __del__(self): + try: + if self._backend_kv is not None: + LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) + self._backend_kv = None + except Exception: + pass try: if self._backend_model is not None: LIB_LLAISYS.llaisysQwen2ModelDestroy(self._backend_model) diff --git a/src/device/nvidia/nvidia_resource.cu b/src/device/nvidia/nvidia_resource.cpp similarity index 86% rename from src/device/nvidia/nvidia_resource.cu rename to src/device/nvidia/nvidia_resource.cpp index 2e63647e5..84c59c6fb 100644 --- a/src/device/nvidia/nvidia_resource.cu +++ b/src/device/nvidia/nvidia_resource.cpp @@ -3,5 +3,6 @@ namespace llaisys::device::nvidia { Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {} +Resource::~Resource() = default; } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_runtime_api.cpp b/src/device/nvidia/nvidia_runtime_api.cpp new file mode 100644 index 000000000..53da4bf77 --- /dev/null +++ b/src/device/nvidia/nvidia_runtime_api.cpp @@ -0,0 +1,123 @@ +#include "../runtime_api.hpp" + +#include +#include + +#include + +namespace llaisys::device::nvidia { + +namespace runtime_api { +static inline void check_cuda(cudaError_t err, const char *msg) { + if (err != cudaSuccess) { + std::cerr << "[CUDA ERROR] " << msg << ": " << cudaGetErrorString(err) << std::endl; + throw std::runtime_error("CUDA runtime API failed"); + } +} + +static inline cudaMemcpyKind to_cuda_memcpy_kind(llaisysMemcpyKind_t kind) { + switch (kind) { + case LLAISYS_MEMCPY_H2H: + return cudaMemcpyHostToHost; + case LLAISYS_MEMCPY_H2D: + return cudaMemcpyHostToDevice; + case LLAISYS_MEMCPY_D2H: + return cudaMemcpyDeviceToHost; + case LLAISYS_MEMCPY_D2D: + return cudaMemcpyDeviceToDevice; + default: + throw 
std::invalid_argument("Unsupported memcpy kind"); + } +} + +int getDeviceCount() { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err == cudaErrorNoDevice || err == cudaErrorInsufficientDriver) { + return 0; + } + check_cuda(err, "cudaGetDeviceCount"); + return count; +} + +void setDevice(int device_id) { + check_cuda(cudaSetDevice(device_id), "cudaSetDevice"); +} + +void deviceSynchronize() { + check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize"); +} + +llaisysStream_t createStream() { + cudaStream_t stream = nullptr; + check_cuda(cudaStreamCreate(&stream), "cudaStreamCreate"); + return reinterpret_cast(stream); +} + +void destroyStream(llaisysStream_t stream) { + if (stream == nullptr) { + return; + } + check_cuda(cudaStreamDestroy(reinterpret_cast(stream)), "cudaStreamDestroy"); +} + +void streamSynchronize(llaisysStream_t stream) { + check_cuda(cudaStreamSynchronize(reinterpret_cast(stream)), "cudaStreamSynchronize"); +} + +void *mallocDevice(size_t size) { + void *ptr = nullptr; + check_cuda(cudaMalloc(&ptr, size), "cudaMalloc"); + return ptr; +} + +void freeDevice(void *ptr) { + if (ptr == nullptr) { + return; + } + check_cuda(cudaFree(ptr), "cudaFree"); +} + +void *mallocHost(size_t size) { + void *ptr = nullptr; + check_cuda(cudaMallocHost(&ptr, size), "cudaMallocHost"); + return ptr; +} + +void freeHost(void *ptr) { + if (ptr == nullptr) { + return; + } + check_cuda(cudaFreeHost(ptr), "cudaFreeHost"); +} + +void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { + check_cuda(cudaMemcpy(dst, src, size, to_cuda_memcpy_kind(kind)), "cudaMemcpy"); +} + +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + check_cuda( + cudaMemcpyAsync(dst, src, size, to_cuda_memcpy_kind(kind), reinterpret_cast(stream)), + "cudaMemcpyAsync"); +} + +static const LlaisysRuntimeAPI RUNTIME_API = { + &getDeviceCount, + &setDevice, + &deviceSynchronize, + 
&createStream, + &destroyStream, + &streamSynchronize, + &mallocDevice, + &freeDevice, + &mallocHost, + &freeHost, + &memcpySync, + &memcpyAsync}; + +} // namespace runtime_api + +const LlaisysRuntimeAPI *getRuntimeAPI() { + return &runtime_api::RUNTIME_API; +} +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu deleted file mode 100644 index cab928261..000000000 --- a/src/device/nvidia/nvidia_runtime_api.cu +++ /dev/null @@ -1,75 +0,0 @@ -#include "../runtime_api.hpp" - -#include -#include - -namespace llaisys::device::nvidia { - -namespace runtime_api { -int getDeviceCount() { - TO_BE_IMPLEMENTED(); -} - -void setDevice(int) { - TO_BE_IMPLEMENTED(); -} - -void deviceSynchronize() { - TO_BE_IMPLEMENTED(); -} - -llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); -} - -void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); -} -void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); -} - -void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); -} - -void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); -} - -void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); -} - -void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); -} - -void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); -} - -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); -} - -static const LlaisysRuntimeAPI RUNTIME_API = { - &getDeviceCount, - &setDevice, - &deviceSynchronize, - &createStream, - &destroyStream, - &streamSynchronize, - &mallocDevice, - &freeDevice, - &mallocHost, - &freeHost, - &memcpySync, - &memcpyAsync}; - -} // namespace runtime_api - -const LlaisysRuntimeAPI *getRuntimeAPI() { - return &runtime_api::RUNTIME_API; -} -} // namespace llaisys::device::nvidia diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index 1c8082415..4166fe18f 100644 --- 
a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -9,9 +9,12 @@ #include #include #include "llaisys/ops.h" +#include "../core/llaisys_core.hpp" struct LlaisysQwen2Model { LlaisysQwen2Meta meta; + llaisysDeviceType_t device_type = LLAISYS_DEVICE_CPU; + int device_id = 0; LlaisysQwen2Weights weights; // store arbitrary named tensors provided from python std::unordered_map weight_map; @@ -26,12 +29,18 @@ __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Met if (!meta) return nullptr; LlaisysQwen2Model *m = new LlaisysQwen2Model(); m->meta = *meta; + m->device_type = device; + m->device_id = (device_ids != nullptr && ndevice > 0) ? device_ids[0] : 0; memset(&m->weights, 0, sizeof(m->weights)); return m; } __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) { if (!model) return; + if (model->kv_cache) { + llaisysQwen2KVDestroy((void *)model->kv_cache); + model->kv_cache = nullptr; + } model->weight_map.clear(); if (model->weights.attn_norm_w) delete[] model->weights.attn_norm_w; if (model->weights.attn_q_w) delete[] model->weights.attn_q_w; @@ -71,8 +80,29 @@ static bool tensor_matches_shape_and_dtype(llaisysTensor_t t, const std::vector< } } +static void ensure_layer_weight_arrays(LlaisysQwen2Model *model) { + if (!model || model->meta.nlayer == 0) { + return; + } + + size_t n = model->meta.nlayer; + if (!model->weights.attn_norm_w) model->weights.attn_norm_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_w) model->weights.attn_q_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_b) model->weights.attn_q_b = new llaisysTensor_t[n](); + if (!model->weights.attn_k_w) model->weights.attn_k_w = new llaisysTensor_t[n](); + if (!model->weights.attn_k_b) model->weights.attn_k_b = new llaisysTensor_t[n](); + if (!model->weights.attn_v_w) model->weights.attn_v_w = new llaisysTensor_t[n](); + if (!model->weights.attn_v_b) model->weights.attn_v_b = new llaisysTensor_t[n](); + if (!model->weights.attn_o_w) 
model->weights.attn_o_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_norm_w) model->weights.mlp_norm_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_gate_w) model->weights.mlp_gate_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_up_w) model->weights.mlp_up_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_down_w) model->weights.mlp_down_w = new llaisysTensor_t[n](); +} + __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor) { if (!model || !name) return -1; + ensure_layer_weight_arrays(model); std::string sname(name); model->weight_map[sname] = tensor; @@ -89,7 +119,7 @@ __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const else std::cerr << "[llaisys qwen2] Warning: out_embed shape/dtype mismatch for " << sname << std::endl; return 0; } - if (str_contains_any(sname, {"model.norm.weight", "ln_f.weight", "final_layernorm.weight", "out_norm.weight", "norm.weight"})) { + if (sname == "norm.weight" || str_contains_any(sname, {"model.norm.weight", "ln_f.weight", "final_layernorm.weight", "out_norm.weight"})) { // avoid matching layer-norm inside layers by checking common patterns std::vector exp = {model->meta.hs}; if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.out_norm_w = tensor; @@ -116,8 +146,17 @@ __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const size_t dh = model->meta.dh; size_t di = model->meta.di; + // post-attention norm + if (str_contains_any(sname, {"post_attention_layernorm.weight", "ln_2.weight", "layernorm_after.weight"})) { + std::vector exp = {hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_norm_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_norm_w["< exp = {hs}; if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_norm_w[idx] = tensor; else std::cerr << "[llaisys qwen2] 
Warning: attn_norm_w["< exp = {hs}; - if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_norm_w[idx] = tensor; - else std::cerr << "[llaisys qwen2] Warning: mlp_norm_w["< exp = {di, hs}; @@ -265,41 +296,91 @@ __export int llaisysQwen2ModelFinalize(struct LlaisysQwen2Model * model) { // Simple KV cache structure and APIs struct KVCache { - size_t max_tokens; - std::vector keys; - std::vector vals; + LlaisysQwen2Model *owner = nullptr; + size_t max_tokens = 0; + size_t nlayer = 0; + std::vector> keys; + std::vector> vals; }; +static llaisysTensor_t make_tensor(const std::vector &shape, llaisysDataType_t dtype, llaisysDeviceType_t device, int device_id) { + return new LlaisysTensor{llaisys::Tensor::create(shape, dtype, device, device_id)}; +} + +static void clear_layer_cache(std::vector &cache) { + for (auto *t : cache) { + delete t; + } + cache.clear(); +} + +static void kv_append_for_layer(KVCache *kv, size_t layer_idx, llaisysTensor_t k, llaisysTensor_t v) { + if (!kv || layer_idx >= kv->nlayer || !k || !v) { + return; + } + auto &ks = kv->keys[layer_idx]; + auto &vs = kv->vals[layer_idx]; + + if (kv->max_tokens > 0 && ks.size() >= kv->max_tokens) { + delete ks.front(); + ks.erase(ks.begin()); + delete vs.front(); + vs.erase(vs.begin()); + } + + ks.push_back(k); + vs.push_back(v); +} + __export void *llaisysQwen2KVCreat(struct LlaisysQwen2Model * model, size_t max_tokens) { + if (!model) return nullptr; + + if (model->kv_cache) { + llaisysQwen2KVDestroy((void *)model->kv_cache); + } + KVCache *kv = new KVCache(); + kv->owner = model; kv->max_tokens = max_tokens; - kv->keys.reserve(max_tokens); - kv->vals.reserve(max_tokens); - if (model) model->kv_cache = kv; + kv->nlayer = model->meta.nlayer; + kv->keys.resize(kv->nlayer); + kv->vals.resize(kv->nlayer); + for (size_t i = 0; i < kv->nlayer; ++i) { + kv->keys[i].reserve(max_tokens); + kv->vals[i].reserve(max_tokens); + } + model->kv_cache = kv; return (void *)kv; } __export void 
llaisysQwen2KVDestroy(void *kv) { if (!kv) return; KVCache *c = (KVCache *)kv; + for (size_t i = 0; i < c->nlayer; ++i) { + clear_layer_cache(c->keys[i]); + clear_layer_cache(c->vals[i]); + } c->keys.clear(); c->vals.clear(); + if (c->owner && c->owner->kv_cache == c) { + c->owner->kv_cache = nullptr; + } delete c; } __export int llaisysQwen2KVAppend(void *kv, llaisysTensor_t k, llaisysTensor_t v) { if (!kv) return -1; KVCache *c = (KVCache *)kv; - if (c->keys.size() >= c->max_tokens) return -1; - c->keys.push_back(k); - c->vals.push_back(v); + if (c->nlayer == 0) return -1; + kv_append_for_layer(c, 0, k, v); return 0; } __export size_t llaisysQwen2KVLen(void *kv) { if (!kv) return 0; KVCache *c = (KVCache *)kv; - return c->keys.size(); + if (c->nlayer == 0) return 0; + return c->keys[0].size(); } __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name) { @@ -309,244 +390,188 @@ __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, co return it != model->weight_map.end() ? 
1 : 0; } -__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { - // More complete per-layer feedforward inference (attention skipped): - if (!model) return -1; - std::cerr << "[llaisys qwen2] infer entry" << std::endl; - if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; - - int64_t last = token_ids[ntoken - 1]; +static int64_t infer_one_token(struct LlaisysQwen2Model *model, int64_t token_id) { + using namespace llaisys; - // require embedding and out projection if (!model->weights.in_embed || !model->weights.out_embed) { - return last; + return token_id; } - using namespace llaisys; + llaisys::core::context().setDevice(model->device_type, model->device_id); + const LlaisysRuntimeAPI *runtime_api = llaisys::core::context().runtime().api(); + + const llaisysDataType_t dtype = model->meta.dtype; + const size_t hs = model->meta.hs; + const size_t nh = model->meta.nh; + const size_t nkvh = model->meta.nkvh; + const size_t dh = model->meta.dh; + const size_t di = model->meta.di; + const std::vector one_shape = {1}; + const std::vector hidden_shape = {1, hs}; + const std::vector q_shape = {1, nh, dh}; + const std::vector kv_shape = {1, nkvh, dh}; - // create index tensor [1] - std::vector idx_shape = {1}; - auto idx_tensor = Tensor::create(idx_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *idx = new LlaisysTensor{idx_tensor}; - idx->tensor->load(&last); + auto idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + idx->tensor->load(&token_id); - // embedding -> x [1, hs] - std::vector emb_shape = {1, model->meta.hs}; - auto x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *x = new LlaisysTensor{x_tensor}; + auto x = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); llaisysEmbedding(x, idx, model->weights.in_embed); - std::cerr << "[llaisys qwen2] after embedding" << 
std::endl; - // per-layer processing - size_t n = model->meta.nlayer; - for (size_t i = 0; i < n; ++i) { - std::cerr << "[llaisys qwen2] layer " << i << " start" << std::endl; - // optional rms_norm -> normed [1, hs] - LlaisysTensor *norm = nullptr; - if (model->weights.attn_norm_w && model->weights.attn_norm_w[i]) { - auto norm_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - norm = new LlaisysTensor{norm_tensor}; - llaisysRmsNorm(norm, x, model->weights.attn_norm_w[i], model->meta.epsilon); - } + KVCache *kv = model->kv_cache; - // Attention: compute q,k,v, append to KV and run self-attention - bool has_attn = model->weights.attn_norm_w && model->weights.attn_q_w && model->weights.attn_k_w && model->weights.attn_v_w && model->weights.attn_o_w && - model->weights.attn_norm_w[i] && model->weights.attn_q_w[i] && model->weights.attn_k_w[i] && model->weights.attn_v_w[i] && model->weights.attn_o_w[i]; + for (size_t i = 0; i < model->meta.nlayer; ++i) { + bool has_attn = model->weights.attn_norm_w && model->weights.attn_q_w && model->weights.attn_k_w && model->weights.attn_v_w && + model->weights.attn_o_w && model->weights.attn_norm_w[i] && model->weights.attn_q_w[i] && + model->weights.attn_k_w[i] && model->weights.attn_v_w[i] && model->weights.attn_o_w[i]; if (has_attn) { - // q: project norm -> [1, nh*dh] then view [1, nh, dh] - size_t nh = model->meta.nh; - size_t dh = model->meta.dh; - size_t nkv = model->meta.nkvh; + auto norm1 = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysRmsNorm(norm1, x, model->weights.attn_norm_w[i], model->meta.epsilon); - std::vector qflat_shape = {1, nh * dh}; - auto qflat_tensor = Tensor::create(qflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *qflat = new LlaisysTensor{qflat_tensor}; - llaisysLinear(qflat, norm ? norm : x, model->weights.attn_q_w[i], model->weights.attn_q_b && model->weights.attn_q_b[i] ? 
model->weights.attn_q_b[i] : nullptr); - // view to [1, nh, dh] - LlaisysTensor *q = nullptr; - try { - auto q_view = qflat->tensor->view({1, nh, dh}); - q = new LlaisysTensor{q_view}; - } catch (...) { - delete qflat; qflat = nullptr; - } + auto qflat = make_tensor({1, nh * dh}, dtype, model->device_type, model->device_id); + auto kflat = make_tensor({1, nkvh * dh}, dtype, model->device_type, model->device_id); + auto vflat = make_tensor({1, nkvh * dh}, dtype, model->device_type, model->device_id); - // k: project norm -> [1, nkv*dh] then view [1, nkv, dh] - std::vector kflat_shape = {1, nkv * dh}; - auto kflat_tensor = Tensor::create(kflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *kflat = new LlaisysTensor{kflat_tensor}; - llaisysLinear(kflat, norm ? norm : x, model->weights.attn_k_w[i], model->weights.attn_k_b && model->weights.attn_k_b[i] ? model->weights.attn_k_b[i] : nullptr); - LlaisysTensor *k = nullptr; - try { - auto k_view = kflat->tensor->view({1, nkv, dh}); - k = new LlaisysTensor{k_view}; - } catch (...) { - delete kflat; kflat = nullptr; - } + llaisysLinear(qflat, norm1, model->weights.attn_q_w[i], model->weights.attn_q_b ? model->weights.attn_q_b[i] : nullptr); + llaisysLinear(kflat, norm1, model->weights.attn_k_w[i], model->weights.attn_k_b ? model->weights.attn_k_b[i] : nullptr); + llaisysLinear(vflat, norm1, model->weights.attn_v_w[i], model->weights.attn_v_b ? model->weights.attn_v_b[i] : nullptr); - // v: project norm -> [1, nkv*dh] then view [1, nkv, dh] - std::vector vflat_shape = {1, nkv * dh}; - auto vflat_tensor = Tensor::create(vflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *vflat = new LlaisysTensor{vflat_tensor}; - llaisysLinear(vflat, norm ? norm : x, model->weights.attn_v_w[i], model->weights.attn_v_b && model->weights.attn_v_b[i] ? 
model->weights.attn_v_b[i] : nullptr); - LlaisysTensor *v = nullptr; - try { - auto v_view = vflat->tensor->view({1, nkv, dh}); - v = new LlaisysTensor{v_view}; - } catch (...) { - delete vflat; vflat = nullptr; + auto q = new LlaisysTensor{qflat->tensor->view(q_shape)}; + auto k = new LlaisysTensor{kflat->tensor->view(kv_shape)}; + auto v = new LlaisysTensor{vflat->tensor->view(kv_shape)}; + + int64_t pos = 0; + if (kv && i < kv->nlayer) { + pos = static_cast(kv->keys[i].size()); } - // append k/v to kv cache if exists (or create ephemeral vectors) - KVCache *kv = model->kv_cache; - if (kv) { - // note: KVAppend stores handles, we transfer ownership semantics to KVCache (do not delete appended tensors here) - llaisysQwen2KVAppend(kv, k, v); + auto pos_ids = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + pos_ids->tensor->load(&pos); + + auto q_rope = make_tensor(q_shape, dtype, model->device_type, model->device_id); + auto k_rope = make_tensor(kv_shape, dtype, model->device_type, model->device_id); + llaisysROPE(q_rope, q, pos_ids, model->meta.theta); + llaisysROPE(k_rope, k, pos_ids, model->meta.theta); + + bool saved_in_cache = false; + if (kv && i < kv->nlayer) { + kv_append_for_layer(kv, i, k_rope, v); + saved_in_cache = true; } - // build k_all and v_all from KV entries - size_t total_len = 1; - std::vector kv_keys; - std::vector kv_vals; - if (kv && !kv->keys.empty()) { - total_len = kv->keys.size(); - kv_keys = kv->keys; - kv_vals = kv->vals; + std::vector local_keys; + std::vector local_vals; + const std::vector *k_src = nullptr; + const std::vector *v_src = nullptr; + if (saved_in_cache) { + k_src = &kv->keys[i]; + v_src = &kv->vals[i]; } else { - total_len = 1; - kv_keys = {k}; - kv_vals = {v}; + local_keys.push_back(k_rope); + local_vals.push_back(v); + k_src = &local_keys; + v_src = &local_vals; } - // create k_all [total_len, nkv, dh], v_all [total_len, nkv, dh] - std::vector k_all_shape = {total_len, nkv, dh}; - 
auto k_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - auto v_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *k_all = new LlaisysTensor{k_all_t}; - LlaisysTensor *v_all = new LlaisysTensor{v_all_t}; - - // copy each stored key/val into k_all/v_all + const size_t total_len = k_src->size(); + auto k_all = make_tensor({total_len, nkvh, dh}, dtype, model->device_type, model->device_id); + auto v_all = make_tensor({total_len, nkvh, dh}, dtype, model->device_type, model->device_id); + const size_t bytes_per_token = nkvh * dh * k_all->tensor->elementSize(); for (size_t j = 0; j < total_len; ++j) { - // compute byte size per entry - size_t elems = nkv * dh; - size_t bytes = elems * model->weights.in_embed->tensor->elementSize(); - // source pointers - const std::byte *src_k = kv_keys[j]->tensor->data(); - const std::byte *src_v = kv_vals[j]->tensor->data(); - std::byte *dst_k = k_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); - std::byte *dst_v = v_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); - // memcpy (works for CPU) - std::memcpy(dst_k, src_k, bytes); - std::memcpy(dst_v, src_v, bytes); + runtime_api->memcpy_sync(k_all->tensor->data() + j * bytes_per_token, (*k_src)[j]->tensor->data(), bytes_per_token, LLAISYS_MEMCPY_D2D); + runtime_api->memcpy_sync(v_all->tensor->data() + j * bytes_per_token, (*v_src)[j]->tensor->data(), bytes_per_token, LLAISYS_MEMCPY_D2D); } - // prepare attn_out [1, nh, dh] - std::vector attn_shape = {1, nh, dh}; - auto attn_t = Tensor::create(attn_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *attn_out = new LlaisysTensor{attn_t}; - - float scale = 1.0f / std::sqrt((float)dh); - llaisysSelfAttention(attn_out, q, k_all, v_all, scale); - - // flatten attn_out to [1, hs] and add to x - try { - auto attn_flat = attn_out->tensor->view({1, model->meta.hs}); - LlaisysTensor 
*attn_flat_t = new LlaisysTensor{attn_flat}; - auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; - llaisysAdd(new_x, x, attn_flat_t); - delete x; x = new_x; - delete attn_flat_t; - } catch (...) { - // ignore + auto attn_val = make_tensor(q_shape, dtype, model->device_type, model->device_id); + const float scale = 1.0f / std::sqrt(static_cast(dh)); + llaisysSelfAttention(attn_val, q_rope, k_all, v_all, scale); + + auto attn_flat = new LlaisysTensor{attn_val->tensor->view(hidden_shape)}; + auto attn_out = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysLinear(attn_out, attn_flat, model->weights.attn_o_w[i], nullptr); + + auto x_next = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysAdd(x_next, x, attn_out); + delete x; + x = x_next; + + delete norm1; + delete qflat; + delete kflat; + delete vflat; + delete q; + delete k; + delete pos_ids; + delete q_rope; + delete k_all; + delete v_all; + delete attn_val; + delete attn_flat; + delete attn_out; + + if (!saved_in_cache) { + delete k_rope; + delete v; } - - // cleanup temporaries we own (do not delete kv-stored tensors) - delete qflat; if (q) delete q; - if (!kv) { delete kflat; if (k) delete k; delete vflat; if (v) delete v; } - delete k_all; delete v_all; delete attn_out; } - // MLP: only run if the expected mlp weights are present bool has_mlp = model->weights.mlp_norm_w && model->weights.mlp_gate_w && model->weights.mlp_up_w && model->weights.mlp_down_w && - model->weights.mlp_norm_w[i] && model->weights.mlp_gate_w[i] && model->weights.mlp_up_w[i] && model->weights.mlp_down_w[i]; + model->weights.mlp_norm_w[i] && model->weights.mlp_gate_w[i] && model->weights.mlp_up_w[i] && + model->weights.mlp_down_w[i]; if (has_mlp) { - std::cerr << "[llaisys qwen2] layer " << i << " mlp present" << std::endl; - auto mlp_in = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, 
LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *mlp_in_t = new LlaisysTensor{mlp_in}; - llaisysRmsNorm(mlp_in_t, x, model->weights.mlp_norm_w[i], model->meta.epsilon); - - // gate [1, di] - std::vector gate_shape = {1, model->meta.di}; - auto gate_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *gate = new LlaisysTensor{gate_tensor}; - // up [1, di] - auto up_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *up = new LlaisysTensor{up_tensor}; - - // linear projections - llaisysLinear(gate, mlp_in_t, model->weights.mlp_gate_w[i], nullptr); - llaisysLinear(up, mlp_in_t, model->weights.mlp_up_w[i], nullptr); - - // swiglu -> act [1, di] - auto act_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *act = new LlaisysTensor{act_tensor}; - llaisysSwiGLU(act, gate, up); + auto norm2 = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysRmsNorm(norm2, x, model->weights.mlp_norm_w[i], model->meta.epsilon); - // down projection -> out [1, hs] - auto down_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *down = new LlaisysTensor{down_tensor}; - llaisysLinear(down, act, model->weights.mlp_down_w[i], nullptr); + auto gate = make_tensor({1, di}, dtype, model->device_type, model->device_id); + auto up = make_tensor({1, di}, dtype, model->device_type, model->device_id); + auto act = make_tensor({1, di}, dtype, model->device_type, model->device_id); + auto down = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); - // residual: x = x + down - auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; - llaisysAdd(new_x, x, down); + llaisysLinear(gate, norm2, model->weights.mlp_gate_w[i], nullptr); + llaisysLinear(up, norm2, model->weights.mlp_up_w[i], nullptr); + 
llaisysSwiGLU(act, gate, up); + llaisysLinear(down, act, model->weights.mlp_down_w[i], nullptr); - // swap x - delete x; x = new_x; + auto x_next = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysAdd(x_next, x, down); + delete x; + x = x_next; - // cleanup temporaries - delete mlp_in_t; + delete norm2; delete gate; delete up; delete act; delete down; - std::cerr << "[llaisys qwen2] layer " << i << " mlp done" << std::endl; } + } - if (norm) delete norm; - std::cerr << "[llaisys qwen2] layer " << i << " end" << std::endl; + llaisysTensor_t logits_in = x; + llaisysTensor_t out_norm = nullptr; + if (model->weights.out_norm_w) { + out_norm = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysRmsNorm(out_norm, x, model->weights.out_norm_w, model->meta.epsilon); + logits_in = out_norm; } - // logits - std::cerr << "[llaisys qwen2] before logits" << std::endl; - std::cerr << "[llaisys qwen2] x ptr=" << x << " out_embed ptr=" << model->weights.out_embed << std::endl; - std::vector logits_shape = {1, model->meta.voc}; - auto logits_tensor = Tensor::create(logits_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *logits = new LlaisysTensor{logits_tensor}; - std::cerr << "[llaisys qwen2] calling llaisysLinear for logits" << std::endl; - llaisysLinear(logits, x, model->weights.out_embed, nullptr); - std::cerr << "[llaisys qwen2] after llaisysLinear for logits" << std::endl; - - // argmax - std::vector one_shape = {1}; - auto max_idx_t = Tensor::create(one_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); - auto max_val_t = Tensor::create(one_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *max_idx = new LlaisysTensor{max_idx_t}; - LlaisysTensor *max_val = new LlaisysTensor{max_val_t}; - std::cerr << "[llaisys qwen2] calling argmax" << std::endl; + auto logits = make_tensor({1, model->meta.voc}, dtype, model->device_type, model->device_id); + llaisysLinear(logits, logits_in, 
model->weights.out_embed, nullptr); + + auto max_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + auto max_val = make_tensor(one_shape, dtype, model->device_type, model->device_id); llaisysArgmax(max_idx, max_val, logits); - std::cerr << "[llaisys qwen2] after argmax" << std::endl; - int64_t *res_ptr = reinterpret_cast(max_idx->tensor->data()); - int64_t next = res_ptr ? res_ptr[0] : model->meta.end_token; + int64_t next = model->meta.end_token; + if (max_idx->tensor->deviceType() == LLAISYS_DEVICE_CPU) { + int64_t *res_ptr = reinterpret_cast(max_idx->tensor->data()); + next = res_ptr ? res_ptr[0] : model->meta.end_token; + } else { + runtime_api->memcpy_sync(&next, max_idx->tensor->data(), sizeof(int64_t), LLAISYS_MEMCPY_D2H); + } - // cleanup delete idx; delete x; + if (out_norm) delete out_norm; delete logits; delete max_idx; delete max_val; @@ -554,6 +579,17 @@ __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_ return next; } +__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { + if (!model) return -1; + if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; + + int64_t next = model->meta.end_token; + for (size_t i = 0; i < ntoken; ++i) { + next = infer_one_token(model, token_ids[i]); + } + return next; +} + } // extern "C" // end of file diff --git a/src/ops/add/nvidia/add_nvidia.cu b/src/ops/add/nvidia/add_nvidia.cu new file mode 100644 index 000000000..372106a2f --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cu @@ -0,0 +1,42 @@ +#include "add_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +namespace llaisys::ops::nvidia { + +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + c[idx] = from_float(to_float(a[idx]) + to_float(b[idx])); +} + +template +void launch_add(std::byte 
*c, const std::byte *a, const std::byte *b, size_t numel) { + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(numel, threads); + auto stream = current_stream(); + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + check_cuda(cudaGetLastError(), "add_kernel launch"); +} + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_add(c, a, b, numel); + case LLAISYS_DTYPE_F16: + return launch_add(c, a, b, numel); + case LLAISYS_DTYPE_BF16: + return launch_add(c, a, b, numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/add/nvidia/add_nvidia.hpp b/src/ops/add/nvidia/add_nvidia.hpp new file mode 100644 index 000000000..2e67b497a --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d7..7f7b40131 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,9 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/add_nvidia.hpp" +#endif namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,8 +28,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cu b/src/ops/argmax/nvidia/argmax_nvidia.cu new file mode 100644 index 000000000..bc426917c --- /dev/null 
+++ b/src/ops/argmax/nvidia/argmax_nvidia.cu @@ -0,0 +1,77 @@ +#include "argmax_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void argmax_kernel(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + constexpr int kBlockSize = 256; + __shared__ float s_val[kBlockSize]; + __shared__ int64_t s_idx[kBlockSize]; + + int tid = threadIdx.x; + + float local_val = -CUDART_INF_F; + int64_t local_idx = 0; + bool has = false; + + for (size_t i = static_cast(tid); i < numel; i += blockDim.x) { + float v = to_float(vals[i]); + if (!has || v > local_val || (v == local_val && static_cast(i) < local_idx)) { + local_val = v; + local_idx = static_cast(i); + has = true; + } + } + + s_val[tid] = local_val; + s_idx[tid] = local_idx; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + float rhs_val = s_val[tid + stride]; + int64_t rhs_idx = s_idx[tid + stride]; + if (rhs_val > s_val[tid] || (rhs_val == s_val[tid] && rhs_idx < s_idx[tid])) { + s_val[tid] = rhs_val; + s_idx[tid] = rhs_idx; + } + } + __syncthreads(); + } + + if (tid == 0) { + *max_idx = s_idx[0]; + *max_val = from_float(s_val[0]); + } +} + +template +void launch_argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t numel) { + constexpr int kBlockSize = 256; + auto stream = current_stream(); + argmax_kernel<<<1, kBlockSize, 0, stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + check_cuda(cudaGetLastError(), "argmax_kernel launch"); +} + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_argmax(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_F16: + return launch_argmax(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_BF16: + return launch_argmax(max_idx, max_val, vals, numel); + 
default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/argmax/nvidia/argmax_nvidia.hpp b/src/ops/argmax/nvidia/argmax_nvidia.hpp new file mode 100644 index 000000000..054fa353c --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 8cafa32de..ac962f269 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/argmax_cpu.hpp" // 记得包含刚才写的新头文件 +#ifdef ENABLE_NVIDIA_API +#include "nvidia/argmax_nvidia.hpp" +#endif namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { @@ -15,7 +18,17 @@ void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { vals->dtype(), vals->numel()); } - // 如果有其他设备,可以在这里继续 switch... 
- EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cu b/src/ops/embedding/nvidia/embedding_nvidia.cu new file mode 100644 index 000000000..5ea8eeb55 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cu @@ -0,0 +1,65 @@ +#include "embedding_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +namespace llaisys::ops::nvidia { + +template +__global__ void embedding_kernel( + T *out, + const int64_t *index, + const T *weight, + size_t num_indices, + size_t embedding_dim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = num_indices * embedding_dim; + if (idx >= total) { + return; + } + + size_t row = idx / embedding_dim; + size_t col = idx % embedding_dim; + int64_t src_row = index[row]; + out[idx] = weight[static_cast(src_row) * embedding_dim + col]; +} + +template +void launch_embedding( + std::byte *out, + const std::byte *index, + const std::byte *weight, + size_t num_indices, + size_t embedding_dim) { + const size_t total = num_indices * embedding_dim; + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + auto stream = current_stream(); + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + num_indices, + embedding_dim); + check_cuda(cudaGetLastError(), "embedding_kernel launch"); +} + +void embedding( + std::byte *out, + const std::byte *index, + const std::byte 
*weight, + llaisysDataType_t type, + size_t num_indices, + size_t embedding_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_embedding(out, index, weight, num_indices, embedding_dim); + case LLAISYS_DTYPE_F16: + return launch_embedding(out, index, weight, num_indices, embedding_dim); + case LLAISYS_DTYPE_BF16: + return launch_embedding(out, index, weight, num_indices, embedding_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/embedding/nvidia/embedding_nvidia.hpp b/src/ops/embedding/nvidia/embedding_nvidia.hpp new file mode 100644 index 000000000..c77e9d152 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void embedding( + std::byte *out, + const std::byte *index, + const std::byte *weight, + llaisysDataType_t type, + size_t num_indices, + size_t embedding_dim); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 1008838f0..42b2b136d 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/embedding_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/embedding_nvidia.hpp" +#endif namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { @@ -25,6 +28,17 @@ void embedding(tensor_t out, tensor_t index, tensor_t weight) { out->dtype(), num_indices, embedding_dim); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), num_indices, embedding_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::embedding(out->data(), index->data(), weight->data(), out->dtype(), num_indices, 
embedding_dim); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/linear/nvidia/linear_nvidia.cu b/src/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..481570e74 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,139 @@ +#include "linear_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +inline void check_cublas(cublasStatus_t status, const char *msg) { + if (status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "[CUBLAS ERROR] " << msg << ": " << static_cast(status) << std::endl; + throw std::runtime_error("cuBLAS API failed"); + } +} + +static thread_local cublasHandle_t TL_HANDLE = nullptr; + +cublasHandle_t get_cublas_handle() { + if (TL_HANDLE == nullptr) { + check_cublas(cublasCreate(&TL_HANDLE), "cublasCreate"); + } + return TL_HANDLE; +} + +template +__global__ void add_bias_kernel(T *out, const T *bias, size_t M, size_t N) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = M * N; + if (idx >= total) { + return; + } + size_t col = idx % N; + out[idx] = from_float(to_float(out[idx]) + to_float(bias[col])); +} + +void launch_bias( + std::byte *out, + const std::byte *bias, + llaisysDataType_t type, + size_t M, + size_t N, + cudaStream_t stream) { + const size_t total = M * N; + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + + switch (type) { + case LLAISYS_DTYPE_F32: + add_bias_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + M, + N); + break; + case LLAISYS_DTYPE_F16: + add_bias_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + M, + N); + break; + case LLAISYS_DTYPE_BF16: + add_bias_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + M, + N); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + check_cuda(cudaGetLastError(), 
"linear add_bias kernel launch"); +} + +void linear( + std::byte *out, + const std::byte *in, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t M, + size_t N, + size_t K) { + // row-major: out[M, N] = in[M, K] * weight[N, K]^T + // map to column-major GEMM: + // out_col[N, M] = op(weight_col[K, N]) * in_col[K, M] + // where op = transpose, so result is equivalent to row-major formula. + const float alpha = 1.0f; + const float beta = 0.0f; + + cudaDataType_t data_type; + switch (type) { + case LLAISYS_DTYPE_F32: + data_type = CUDA_R_32F; + break; + case LLAISYS_DTYPE_F16: + data_type = CUDA_R_16F; + break; + case LLAISYS_DTYPE_BF16: + data_type = CUDA_R_16BF; + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + auto stream = current_stream(); + cublasHandle_t handle = get_cublas_handle(); + check_cublas(cublasSetStream(handle, stream), "cublasSetStream"); + + check_cublas( + cublasGemmEx( + handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + static_cast(N), + static_cast(M), + static_cast(K), + &alpha, + weight, + data_type, + static_cast(K), + in, + data_type, + static_cast(K), + &beta, + out, + data_type, + static_cast(N), + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP), + "cublasGemmEx"); + + if (bias != nullptr) { + launch_bias(out, bias, type, M, N, stream); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/linear/nvidia/linear_nvidia.hpp b/src/ops/linear/nvidia/linear_nvidia.hpp new file mode 100644 index 000000000..72f5440b2 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void linear( + std::byte *out, + const std::byte *in, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t M, + size_t N, + size_t K); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 5222917c3..487e2deb2 100644 --- a/src/ops/linear/op.cpp +++ 
b/src/ops/linear/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/linear_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/linear_nvidia.hpp" +#endif namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { @@ -27,6 +30,18 @@ void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { out->dtype(), M, N, K); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + const std::byte *bias_ptr = bias ? bias->data() : nullptr; + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear(out->data(), in->data(), weight->data(), bias_ptr, out->dtype(), M, N, K); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::linear(out->data(), in->data(), weight->data(), bias_ptr, out->dtype(), M, N, K); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/nvidia_cuda.cuh b/src/ops/nvidia_cuda.cuh new file mode 100644 index 000000000..4c41bb544 --- /dev/null +++ b/src/ops/nvidia_cuda.cuh @@ -0,0 +1,77 @@ +#pragma once + +#include "../core/llaisys_core.hpp" +#include "../utils.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { + +inline void check_cuda(cudaError_t err, const char *msg) { + if (err != cudaSuccess) { + std::cerr << "[CUDA ERROR] " << msg << ": " << cudaGetErrorString(err) << std::endl; + throw std::runtime_error("CUDA API failed"); + } +} + +inline cudaStream_t current_stream() { + return reinterpret_cast(llaisys::core::context().runtime().stream()); +} + +template +__device__ inline float to_float(T v); + +template <> +__device__ inline float to_float(float v) { + return v; +} + +template <> +__device__ inline float to_float(llaisys::fp16_t v) { + return 
__half2float(__ushort_as_half(v._v)); +} + +template <> +__device__ inline float to_float(llaisys::bf16_t v) { + return __bfloat162float(__ushort_as_bfloat16(v._v)); +} + +template +__device__ inline T from_float(float v); + +template <> +__device__ inline float from_float(float v) { + return v; +} + +template <> +__device__ inline llaisys::fp16_t from_float(float v) { + llaisys::fp16_t out; + out._v = __half_as_ushort(__float2half_rn(v)); + return out; +} + +template <> +__device__ inline llaisys::bf16_t from_float(float v) { + llaisys::bf16_t out; + out._v = __bfloat16_as_ushort(__float2bfloat16(v)); + return out; +} + +inline int num_threads_1d() { + return 256; +} + +inline int num_blocks_1d(size_t n, int threads = 256) { + return static_cast((n + threads - 1) / threads); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/ops/rearrange/nvidia/rearrange_nvidia.cu new file mode 100644 index 000000000..e332702be --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cu @@ -0,0 +1,108 @@ +#include "rearrange_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void rearrange_kernel( + T *out, + const T *in, + const int64_t *shape, + const int64_t *in_strides, + const int64_t *out_strides, + int rank, + size_t total) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= total) { + return; + } + + size_t t = idx; + int64_t in_off = 0; + int64_t out_off = 0; + + for (int d = rank - 1; d >= 0; --d) { + int64_t cur = static_cast(t % static_cast(shape[d])); + t /= static_cast(shape[d]); + in_off += cur * in_strides[d]; + out_off += cur * out_strides[d]; + } + + out[static_cast(out_off)] = in[static_cast(in_off)]; +} + +template +void launch_rearrange( + std::byte *out, + const std::byte *in, + const std::vector &shape, + const std::vector &in_strides, + const std::vector &out_strides) { + const int rank = 
static_cast(shape.size()); + std::vector h_shape(rank); + std::vector h_in_strides(rank); + std::vector h_out_strides(rank); + + size_t total = 1; + for (int i = 0; i < rank; ++i) { + h_shape[i] = static_cast(shape[i]); + h_in_strides[i] = static_cast(in_strides[i]); + h_out_strides[i] = static_cast(out_strides[i]); + total *= shape[i]; + } + + int64_t *d_shape = nullptr; + int64_t *d_in_strides = nullptr; + int64_t *d_out_strides = nullptr; + + const size_t meta_bytes = static_cast(rank) * sizeof(int64_t); + auto stream = current_stream(); + + check_cuda(cudaMalloc(&d_shape, meta_bytes), "cudaMalloc d_shape"); + check_cuda(cudaMalloc(&d_in_strides, meta_bytes), "cudaMalloc d_in_strides"); + check_cuda(cudaMalloc(&d_out_strides, meta_bytes), "cudaMalloc d_out_strides"); + + check_cuda(cudaMemcpyAsync(d_shape, h_shape.data(), meta_bytes, cudaMemcpyHostToDevice, stream), "copy shape"); + check_cuda(cudaMemcpyAsync(d_in_strides, h_in_strides.data(), meta_bytes, cudaMemcpyHostToDevice, stream), "copy in_strides"); + check_cuda(cudaMemcpyAsync(d_out_strides, h_out_strides.data(), meta_bytes, cudaMemcpyHostToDevice, stream), "copy out_strides"); + + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + rearrange_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + d_shape, + d_in_strides, + d_out_strides, + rank, + total); + check_cuda(cudaGetLastError(), "rearrange_kernel launch"); + + check_cuda(cudaFree(d_shape), "cudaFree d_shape"); + check_cuda(cudaFree(d_in_strides), "cudaFree d_in_strides"); + check_cuda(cudaFree(d_out_strides), "cudaFree d_out_strides"); +} + +void rearrange( + std::byte *out, + const std::byte *in, + llaisysDataType_t type, + const std::vector &shape, + const std::vector &in_strides, + const std::vector &out_strides) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_rearrange(out, in, shape, in_strides, out_strides); + case LLAISYS_DTYPE_F16: + return launch_rearrange(out, in, shape, 
in_strides, out_strides); + case LLAISYS_DTYPE_BF16: + return launch_rearrange(out, in, shape, in_strides, out_strides); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.hpp b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp new file mode 100644 index 000000000..6784a8b96 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange( + std::byte *out, + const std::byte *in, + llaisysDataType_t type, + const std::vector &shape, + const std::vector &in_strides, + const std::vector &out_strides); +} diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 5542d1345..8a9431e30 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rearrange_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rearrange_nvidia.hpp" +#endif namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { @@ -15,6 +18,17 @@ void rearrange(tensor_t out, tensor_t in) { in->shape(), in->strides(), out->strides()); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rearrange(out->data(), in->data(), out->dtype(), in->shape(), in->strides(), out->strides()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rearrange(out->data(), in->data(), out->dtype(), in->shape(), in->strides(), out->strides()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} } -} \ No newline at end of file diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu new file mode 100644 index 000000000..e5d35c0c5 --- /dev/null +++ 
b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -0,0 +1,85 @@ +#include "rms_norm_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +namespace llaisys::ops::nvidia { + +template +__global__ void rms_norm_kernel(T *out, const T *in, const T *weight, size_t M, size_t d, float eps) { + size_t row = blockIdx.x; + if (row >= M) { + return; + } + + extern __shared__ float shm[]; + float *s_sum = shm; + + const T *row_in = in + row * d; + T *row_out = out + row * d; + + float local_sum = 0.0f; + for (size_t j = threadIdx.x; j < d; j += blockDim.x) { + float x = to_float(row_in[j]); + local_sum += x * x; + } + + s_sum[threadIdx.x] = local_sum; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + s_sum[threadIdx.x] += s_sum[threadIdx.x + stride]; + } + __syncthreads(); + } + + float inv_rms = rsqrtf(s_sum[0] / static_cast(d) + eps); + for (size_t j = threadIdx.x; j < d; j += blockDim.x) { + float x = to_float(row_in[j]); + float w = to_float(weight[j]); + row_out[j] = from_float(x * w * inv_rms); + } +} + +template +void launch_rms_norm( + std::byte *out, + const std::byte *in, + const std::byte *weight, + size_t M, + size_t d, + float eps) { + const int threads = 256; + const size_t shared_bytes = static_cast(threads) * sizeof(float); + auto stream = current_stream(); + rms_norm_kernel<<(M), threads, shared_bytes, stream>>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + M, + d, + eps); + check_cuda(cudaGetLastError(), "rms_norm_kernel launch"); +} + +void rms_norm( + std::byte *out, + const std::byte *in, + const std::byte *weight, + llaisysDataType_t type, + size_t M, + size_t d, + float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_rms_norm(out, in, weight, M, d, eps); + case LLAISYS_DTYPE_F16: + return launch_rms_norm(out, in, weight, M, d, eps); + case LLAISYS_DTYPE_BF16: + return launch_rms_norm(out, in, weight, M, d, eps); + default: + 
EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp new file mode 100644 index 000000000..8892a1dab --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm( + std::byte *out, + const std::byte *in, + const std::byte *weight, + llaisysDataType_t type, + size_t M, + size_t d, + float eps); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 9b0005857..a971b3ee1 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rms_norm_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rms_norm_nvidia.hpp" +#endif namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { @@ -22,6 +25,17 @@ void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { out->dtype(), M, d, eps); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), M, d, eps); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), M, d, eps); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/rope/nvidia/rope_nvidia.cu b/src/ops/rope/nvidia/rope_nvidia.cu new file mode 100644 index 000000000..f6ecbb60d --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cu @@ -0,0 +1,92 @@ +#include "rope_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ 
void rope_kernel( + T *out, + const T *in, + const int64_t *pos_ids, + size_t seqlen, + size_t nhead, + size_t d, + float theta) { + size_t half_d = d / 2; + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = seqlen * nhead * half_d; + if (idx >= total) { + return; + } + + size_t j = idx % half_d; + size_t tmp = idx / half_d; + size_t h = tmp % nhead; + size_t i = tmp / nhead; + + double pos = static_cast(pos_ids[i]); + double exponent = (2.0 * static_cast(j)) / static_cast(d); + double angle = pos / pow(static_cast(theta), exponent); + double c = cos(angle); + double s = sin(angle); + + size_t base = (i * nhead + h) * d; + size_t idx_a = base + j; + size_t idx_b = base + j + half_d; + + double a = static_cast(to_float(in[idx_a])); + double b = static_cast(to_float(in[idx_b])); + + out[idx_a] = from_float(static_cast(a * c - b * s)); + out[idx_b] = from_float(static_cast(b * c + a * s)); +} + +template +void launch_rope( + std::byte *out, + const std::byte *in, + const std::byte *pos_ids, + size_t seqlen, + size_t nhead, + size_t d, + float theta) { + size_t total = seqlen * nhead * (d / 2); + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + auto stream = current_stream(); + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seqlen, + nhead, + d, + theta); + check_cuda(cudaGetLastError(), "rope_kernel launch"); +} + +void rope( + std::byte *out, + const std::byte *in, + const std::byte *pos_ids, + llaisysDataType_t type, + size_t seqlen, + size_t nhead, + size_t d, + float theta) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_rope(out, in, pos_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_F16: + return launch_rope(out, in, pos_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_BF16: + return launch_rope(out, in, pos_ids, seqlen, nhead, d, theta); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace 
llaisys::ops::nvidia diff --git a/src/ops/rope/nvidia/rope_nvidia.hpp b/src/ops/rope/nvidia/rope_nvidia.hpp new file mode 100644 index 000000000..a93fa9159 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rope( + std::byte *out, + const std::byte *in, + const std::byte *pos_ids, + llaisysDataType_t type, + size_t seqlen, + size_t nhead, + size_t d, + float theta); +} diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index b56241b59..d69b21331 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rope_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rope_nvidia.hpp" +#endif namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { @@ -24,6 +27,17 @@ void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { out->dtype(), seqlen, nhead, d, theta); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), pos_ids->data(), out->dtype(), seqlen, nhead, d, theta); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rope(out->data(), in->data(), pos_ids->data(), out->dtype(), seqlen, nhead, d, theta); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu new file mode 100644 index 000000000..22a525f17 --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -0,0 +1,149 @@ +#include "self_attention_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + 
+template +__global__ void self_attention_kernel( + T *attn_val, + const T *q, + const T *k, + const T *v, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = seqlen * nhead * dv; + if (idx >= total) { + return; + } + + size_t cdv = idx % dv; + size_t t = idx / dv; + size_t h = t % nhead; + size_t i = t / nhead; + + size_t n_groups = nhead / nkvhead; + size_t h_kv = h / n_groups; + + ptrdiff_t cutoff = static_cast(total_len) - static_cast(seqlen) + static_cast(i); + + float max_score = -INFINITY; + for (size_t j = 0; j < total_len; ++j) { + if (static_cast(j) > cutoff) { + continue; + } + + float dot = 0.0f; + size_t q_base = (i * nhead + h) * d; + size_t k_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += to_float(q[q_base + c]) * to_float(k[k_base + c]); + } + float score = dot * scale; + max_score = fmaxf(max_score, score); + } + + float sum_exp = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (static_cast(j) > cutoff) { + continue; + } + + float dot = 0.0f; + size_t q_base = (i * nhead + h) * d; + size_t k_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += to_float(q[q_base + c]) * to_float(k[k_base + c]); + } + float score = dot * scale; + sum_exp += expf(score - max_score); + } + + float out_val = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (static_cast(j) > cutoff) { + continue; + } + + float dot = 0.0f; + size_t q_base = (i * nhead + h) * d; + size_t k_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += to_float(q[q_base + c]) * to_float(k[k_base + c]); + } + float score = dot * scale; + float prob = expf(score - max_score) / sum_exp; + + size_t v_idx = (j * nkvhead + h_kv) * dv + cdv; + out_val += prob * to_float(v[v_idx]); + } + + attn_val[idx] = from_float(out_val); +} + +template +void launch_self_attention( + 
std::byte *attn_val, + const std::byte *q, + const std::byte *k, + const std::byte *v, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale) { + size_t total = seqlen * nhead * dv; + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + auto stream = current_stream(); + self_attention_kernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, + total_len, + nhead, + nkvhead, + d, + dv, + scale); + check_cuda(cudaGetLastError(), "self_attention_kernel launch"); +} + +void self_attention( + std::byte *attn_val, + const std::byte *q, + const std::byte *k, + const std::byte *v, + llaisysDataType_t type, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_self_attention(attn_val, q, k, v, seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_F16: + return launch_self_attention(attn_val, q, k, v, seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_BF16: + return launch_self_attention(attn_val, q, k, v, seqlen, total_len, nhead, nkvhead, d, dv, scale); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.hpp b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp new file mode 100644 index 000000000..ec7139b4b --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention( + std::byte *attn_val, + const std::byte *q, + const std::byte *k, + const std::byte *v, + llaisysDataType_t type, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale); +} diff --git 
a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index b139964de..e98802461 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/self_attention_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/self_attention_nvidia.hpp" +#endif namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { @@ -24,6 +27,19 @@ void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} } -} \ No newline at end of file diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/ops/swiglu/nvidia/swiglu_nvidia.cu new file mode 100644 index 000000000..a1f0f924b --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -0,0 +1,53 @@ +#include "swiglu_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void swiglu_kernel(T *out, const T *gate, const T *up, size_t numel) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + + const float g = to_float(gate[idx]); + const float u = to_float(up[idx]); + const float silu = g / (1.0f + expf(-g)); + out[idx] = from_float(u * silu); +} + +template +void 
launch_swiglu(std::byte *out, const std::byte *gate, const std::byte *up, size_t numel) { + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(numel, threads); + auto stream = current_stream(); + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + check_cuda(cudaGetLastError(), "swiglu_kernel launch"); +} + +void swiglu( + std::byte *out, + const std::byte *gate, + const std::byte *up, + llaisysDataType_t type, + size_t num_elements) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_swiglu(out, gate, up, num_elements); + case LLAISYS_DTYPE_F16: + return launch_swiglu(out, gate, up, num_elements); + case LLAISYS_DTYPE_BF16: + return launch_swiglu(out, gate, up, num_elements); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.hpp b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp new file mode 100644 index 000000000..468a79d5c --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void swiglu( + std::byte *out, + const std::byte *gate, + const std::byte *up, + llaisysDataType_t type, + size_t num_elements); +} diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 7f224b214..f8b07dfbf 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/swiglu_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/swiglu_nvidia.hpp" +#endif namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { @@ -24,6 +27,17 @@ void swiglu(tensor_t out, tensor_t gate, tensor_t up) { out->dtype(), num_elements); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + 
return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), num_elements); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::swiglu(out->data(), gate->data(), up->data(), out->dtype(), num_elements); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} } -} \ No newline at end of file diff --git a/test/ops/self_attention.py b/test/ops/self_attention.py index a042b51be..abf3927a8 100644 --- a/test/ops/self_attention.py +++ b/test/ops/self_attention.py @@ -15,7 +15,7 @@ def torch_self_attention(attn_val, query, key, value, scale): L, S = query.size(-2), key.size(-2) attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device) - temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=S-L) + temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=S-L) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) diff --git a/xmake.lua b/xmake.lua index 095d96875..f63954ef9 100644 --- a/xmake.lua +++ b/xmake.lua @@ -37,6 +37,9 @@ target("llaisys-device") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-device-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -83,6 +86,9 @@ target_end() target("llaisys-ops") set_kind("static") add_deps("llaisys-ops-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-ops-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -120,4 +126,4 @@ target("llaisys") os.cp("lib/*.so", "python/llaisys/libllaisys/") end end) -target_end() \ No newline at end of file +target_end() diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua new file mode 100644 index 000000000..bbd2e13c3 --- /dev/null +++ b/xmake/nvidia.lua @@ -0,0 +1,36 @@ +target("llaisys-device-nvidia") + set_kind("static") + set_languages("cxx17") + set_warnings("all", "error") + add_includedirs("/usr/local/cuda/include") + add_linkdirs("/usr/local/cuda/lib64") + 
add_links("cudart") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + end + + add_files("../src/device/nvidia/*.cpp") + + on_install(function (target) end) +target_end() + +target("llaisys-ops-nvidia") + set_kind("static") + add_deps("llaisys-tensor") + + add_rules("cuda") + set_values("cuda.rdc", false) + + set_languages("cxx17") + set_warnings("all", "error") + add_includedirs("/usr/local/cuda/include") + add_linkdirs("/usr/local/cuda/lib64") + add_links("cudart", "cublas") + if not is_plat("windows") then + add_cuflags("-Xcompiler=-fPIC") + end + + add_files("../src/ops/*/nvidia/*.cu") + + on_install(function (target) end) +target_end() From 9cb8bb3e9c7df205bdcc1a6cddb77c2af1941e8e Mon Sep 17 00:00:00 2001 From: mengyijia49 Date: Mon, 16 Mar 2026 15:48:11 +0800 Subject: [PATCH 7/9] Add random sampling and minimal chat server/CLI with NVIDIA build support --- include/llaisys/models/qwen2.h | 8 + include/llaisys/ops.h | 7 + python/llaisys/chat/__init__.py | 3 + python/llaisys/chat/cli.py | 137 ++++++++++ python/llaisys/chat/engine.py | 112 +++++++++ python/llaisys/chat/server.py | 189 ++++++++++++++ python/llaisys/libllaisys/ops.py | 12 +- python/llaisys/libllaisys/qwen2.py | 16 ++ python/llaisys/models/qwen2.py | 36 ++- python/llaisys/ops.py | 20 +- src/llaisys/ops.cc | 16 ++ src/llaisys/qwen2.cc | 65 ++++- src/ops/nvidia_cuda.cuh | 7 +- .../random_sample/cpu/random_sample_cpu.cpp | 186 ++++++++++++++ .../random_sample/cpu/random_sample_cpu.hpp | 18 ++ .../nvidia/random_sample_nvidia.cu | 236 ++++++++++++++++++ .../nvidia/random_sample_nvidia.hpp | 18 ++ src/ops/random_sample/op.cpp | 83 ++++++ src/ops/random_sample/op.hpp | 16 ++ xmake/nvidia.lua | 36 ++- 20 files changed, 1197 insertions(+), 24 deletions(-) create mode 100644 python/llaisys/chat/__init__.py create mode 100644 python/llaisys/chat/cli.py create mode 100644 python/llaisys/chat/engine.py create mode 100644 python/llaisys/chat/server.py create mode 100644 
src/ops/random_sample/cpu/random_sample_cpu.cpp create mode 100644 src/ops/random_sample/cpu/random_sample_cpu.hpp create mode 100644 src/ops/random_sample/nvidia/random_sample_nvidia.cu create mode 100644 src/ops/random_sample/nvidia/random_sample_nvidia.hpp create mode 100644 src/ops/random_sample/op.cpp create mode 100644 src/ops/random_sample/op.hpp diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 296f0f15f..04f043a2f 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -47,6 +47,14 @@ __C { __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name); __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export int64_t llaisysQwen2ModelInferSample( + struct LlaisysQwen2Model * model, + int64_t * token_ids, + size_t ntoken, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); // KV cache APIs __export void *llaisysQwen2KVCreat(struct LlaisysQwen2Model * model, size_t max_tokens); diff --git a/include/llaisys/ops.h b/include/llaisys/ops.h index ddb3be246..dd46a18b5 100644 --- a/include/llaisys/ops.h +++ b/include/llaisys/ops.h @@ -6,6 +6,13 @@ __C { __export void llaisysAdd(llaisysTensor_t c, llaisysTensor_t a, llaisysTensor_t b); __export void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals); + __export void llaisysRandomSample( + llaisysTensor_t out_idx, + llaisysTensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); __export void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight); __export void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias); __export void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in); diff --git a/python/llaisys/chat/__init__.py b/python/llaisys/chat/__init__.py new file mode 100644 index 
000000000..ee6c0897e --- /dev/null +++ b/python/llaisys/chat/__init__.py @@ -0,0 +1,3 @@ +from .engine import ChatEngine + +__all__ = ["ChatEngine"] diff --git a/python/llaisys/chat/cli.py b/python/llaisys/chat/cli.py new file mode 100644 index 000000000..de7f27a64 --- /dev/null +++ b/python/llaisys/chat/cli.py @@ -0,0 +1,137 @@ +import argparse +import json +import sys +import urllib.error +import urllib.request +from typing import Any, Dict, List, Optional + + +def _post_json(url: str, payload: Dict[str, Any], stream: bool): + body = json.dumps(payload, ensure_ascii=False).encode("utf-8") + headers = { + "Content-Type": "application/json", + "Accept": "text/event-stream" if stream else "application/json", + } + req = urllib.request.Request(url, data=body, headers=headers, method="POST") + return urllib.request.urlopen(req, timeout=600) + + +def _send_chat( + endpoint: str, + model: str, + messages: List[Dict[str, str]], + max_new_tokens: int, + temperature: float, + top_k: int, + top_p: float, + seed: Optional[int], + stream: bool, +) -> str: + payload = { + "model": model, + "messages": messages, + "max_new_tokens": int(max_new_tokens), + "temperature": float(temperature), + "top_k": int(top_k), + "top_p": float(top_p), + "stream": bool(stream), + } + if seed is not None: + payload["seed"] = int(seed) + + if not stream: + with _post_json(endpoint, payload, stream=False) as resp: + data = json.loads(resp.read().decode("utf-8")) + return data["choices"][0]["message"]["content"] + + assistant_text = "" + with _post_json(endpoint, payload, stream=True) as resp: + for raw in resp: + line = raw.decode("utf-8").strip() + if not line or not line.startswith("data:"): + continue + content = line[5:].strip() + if content == "[DONE]": + break + + chunk = json.loads(content) + choices = chunk.get("choices", []) + if not choices: + continue + delta = choices[0].get("delta", {}) + piece = delta.get("content", "") + if piece: + sys.stdout.write(piece) + sys.stdout.flush() + 
assistant_text += piece + sys.stdout.write("\n") + sys.stdout.flush() + return assistant_text + + +def main(): + parser = argparse.ArgumentParser(description="LLAISYS interactive chat CLI") + parser.add_argument("--server", default="http://127.0.0.1:8000") + parser.add_argument("--model", default="qwen2") + parser.add_argument("--max-new-tokens", default=256, type=int) + parser.add_argument("--temperature", default=0.8, type=float) + parser.add_argument("--top-k", default=50, type=int) + parser.add_argument("--top-p", default=0.9, type=float) + parser.add_argument("--seed", default=None, type=int) + parser.add_argument("--system", default=None, type=str) + parser.add_argument("--stream", action="store_true") + args = parser.parse_args() + + endpoint = args.server.rstrip("/") + "/v1/chat/completions" + messages: List[Dict[str, str]] = [] + if args.system: + messages.append({"role": "system", "content": args.system}) + + print("Commands: /exit to quit, /reset to clear history.") + while True: + try: + user_text = input("You: ").strip() + except EOFError: + print() + break + + if not user_text: + continue + if user_text in ("/exit", "/quit"): + break + if user_text == "/reset": + messages = [] + if args.system: + messages.append({"role": "system", "content": args.system}) + print("History cleared.") + continue + + messages.append({"role": "user", "content": user_text}) + + try: + if args.stream: + sys.stdout.write("Assistant: ") + sys.stdout.flush() + assistant_text = _send_chat( + endpoint=endpoint, + model=args.model, + messages=messages, + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + seed=args.seed, + stream=args.stream, + ) + if not args.stream: + print(f"Assistant: {assistant_text}") + messages.append({"role": "assistant", "content": assistant_text}) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + print(f"HTTP {exc.code}: {detail}") + except 
Exception as exc: # pragma: no cover - network/runtime errors + print(f"Request failed: {exc}") + + +if __name__ == "__main__": + main() diff --git a/python/llaisys/chat/engine.py b/python/llaisys/chat/engine.py new file mode 100644 index 000000000..201b952a7 --- /dev/null +++ b/python/llaisys/chat/engine.py @@ -0,0 +1,112 @@ +from typing import Any, Dict, Iterable, List, Optional + +from transformers import AutoTokenizer + +from .. import DeviceType +from ..models import Qwen2 + + +def _normalize_content(content: Any) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + chunks: List[str] = [] + for item in content: + if isinstance(item, dict): + if item.get("type") == "text" and isinstance(item.get("text"), str): + chunks.append(item["text"]) + elif isinstance(item, str): + chunks.append(item) + return "".join(chunks) + return str(content) + + +def _normalize_messages(messages: Iterable[Dict[str, Any]]) -> List[Dict[str, str]]: + conversation: List[Dict[str, str]] = [] + for message in messages: + if not isinstance(message, dict): + continue + role = str(message.get("role", "user")) + content = _normalize_content(message.get("content", "")) + conversation.append({"role": role, "content": content}) + return conversation + + +class ChatEngine: + def __init__( + self, + model_path: str, + device: str = "cpu", + max_new_tokens: int = 256, + top_k: int = 50, + top_p: float = 0.9, + temperature: float = 0.8, + ): + self.model_path = model_path + self.max_new_tokens = int(max_new_tokens) + self.top_k = int(top_k) + self.top_p = float(top_p) + self.temperature = float(temperature) + + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + if device.lower() == "nvidia": + model_device = DeviceType.NVIDIA + else: + model_device = DeviceType.CPU + self.model = Qwen2(model_path, model_device) + + def complete( + self, + messages: Iterable[Dict[str, Any]], + max_new_tokens: Optional[int] = None, + top_k: 
Optional[int] = None, + top_p: Optional[float] = None, + temperature: Optional[float] = None, + seed: Optional[int] = None, + ) -> Dict[str, Any]: + conversation = _normalize_messages(messages) + input_content = self.tokenizer.apply_chat_template( + conversation=conversation, + add_generation_prompt=True, + tokenize=False, + ) + input_ids = self.tokenizer.encode(input_content) + + out_ids = self.model.generate( + input_ids, + max_new_tokens=self.max_new_tokens if max_new_tokens is None else int(max_new_tokens), + top_k=self.top_k if top_k is None else int(top_k), + top_p=self.top_p if top_p is None else float(top_p), + temperature=self.temperature if temperature is None else float(temperature), + seed=seed, + ) + + prompt_tokens = len(input_ids) + if len(out_ids) >= prompt_tokens: + generated_ids = out_ids[prompt_tokens:] + else: + generated_ids = [] + assistant_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + + return { + "prompt_tokens": prompt_tokens, + "completion_tokens": len(generated_ids), + "total_tokens": prompt_tokens + len(generated_ids), + "output_ids": out_ids, + "generated_ids": generated_ids, + "assistant_text": assistant_text, + } + + def iter_text_deltas(self, generated_ids: Iterable[int]): + seen = "" + buffer: List[int] = [] + for token_id in generated_ids: + buffer.append(int(token_id)) + current = self.tokenizer.decode(buffer, skip_special_tokens=True) + if current.startswith(seen): + delta = current[len(seen):] + else: + delta = current + seen = current + if delta: + yield delta diff --git a/python/llaisys/chat/server.py b/python/llaisys/chat/server.py new file mode 100644 index 000000000..ff1da8044 --- /dev/null +++ b/python/llaisys/chat/server.py @@ -0,0 +1,189 @@ +import argparse +import json +import threading +import time +import uuid +from typing import Any, Dict, List, Optional + +try: + from fastapi import FastAPI + from fastapi.responses import JSONResponse, StreamingResponse + from pydantic import BaseModel 
+except Exception as exc: # pragma: no cover - runtime dependency guard + raise RuntimeError( + "FastAPI is required for chat server. Install with: pip install fastapi uvicorn" + ) from exc + +from .engine import ChatEngine + + +def _dump_model(obj: Any) -> Dict[str, Any]: + if hasattr(obj, "model_dump"): + return obj.model_dump() + if hasattr(obj, "dict"): + return obj.dict() + return dict(obj) + + +class ChatMessage(BaseModel): + role: str + content: Any + + +class ChatCompletionRequest(BaseModel): + model: str = "qwen2" + messages: List[ChatMessage] + max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None + temperature: float = 0.8 + top_p: float = 0.9 + top_k: int = 50 + seed: Optional[int] = None + stream: bool = False + + +def create_app( + model_path: str, + device: str = "cpu", + max_new_tokens: int = 256, + top_k: int = 50, + top_p: float = 0.9, + temperature: float = 0.8, +) -> FastAPI: + app = FastAPI(title="LLAISYS Chat Server", version="0.1.0") + engine = ChatEngine( + model_path=model_path, + device=device, + max_new_tokens=max_new_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ) + request_lock = threading.Lock() + + @app.get("/healthz") + def healthz(): + return {"status": "ok"} + + @app.post("/v1/chat/completions") + def chat_completions(req: ChatCompletionRequest): + request_model = req.model + request_messages = [_dump_model(m) for m in req.messages] + request_max_new_tokens = req.max_new_tokens + if request_max_new_tokens is None: + request_max_new_tokens = req.max_tokens + + with request_lock: + result = engine.complete( + messages=request_messages, + max_new_tokens=request_max_new_tokens, + top_k=req.top_k, + top_p=req.top_p, + temperature=req.temperature, + seed=req.seed, + ) + + completion_id = f"chatcmpl-{uuid.uuid4().hex}" + created = int(time.time()) + + if req.stream: + generated_ids = result["generated_ids"] + + def event_stream(): + role_chunk = { + "id": completion_id, + "object": 
"chat.completion.chunk", + "created": created, + "model": request_model, + "choices": [ + { + "index": 0, + "delta": {"role": "assistant"}, + "finish_reason": None, + } + ], + } + yield f"data: {json.dumps(role_chunk, ensure_ascii=False)}\n\n" + + for delta in engine.iter_text_deltas(generated_ids): + chunk = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": request_model, + "choices": [ + { + "index": 0, + "delta": {"content": delta}, + "finish_reason": None, + } + ], + } + yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n" + + stop_chunk = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": request_model, + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + } + yield f"data: {json.dumps(stop_chunk, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(event_stream(), media_type="text/event-stream") + + payload = { + "id": completion_id, + "object": "chat.completion", + "created": created, + "model": request_model, + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": result["assistant_text"]}, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": result["prompt_tokens"], + "completion_tokens": result["completion_tokens"], + "total_tokens": result["total_tokens"], + }, + } + return JSONResponse(payload) + + return app + + +def main(): + parser = argparse.ArgumentParser(description="LLAISYS OpenAI-compatible chat server") + parser.add_argument("--model", required=True, help="Path to model directory") + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"]) + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", default=8000, type=int) + parser.add_argument("--max-new-tokens", default=256, type=int) + parser.add_argument("--top-k", default=50, type=int) + parser.add_argument("--top-p", default=0.9, type=float) + 
parser.add_argument("--temperature", default=0.8, type=float) + args = parser.parse_args() + + app = create_app( + model_path=args.model, + device=args.device, + max_new_tokens=args.max_new_tokens, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + ) + + try: + import uvicorn + except Exception as exc: # pragma: no cover - runtime dependency guard + raise RuntimeError("uvicorn is required. Install with: pip install uvicorn") from exc + + uvicorn.run(app, host=args.host, port=args.port) + + +if __name__ == "__main__": + main() diff --git a/python/llaisys/libllaisys/ops.py b/python/llaisys/libllaisys/ops.py index 5be095eff..0c27b4421 100644 --- a/python/llaisys/libllaisys/ops.py +++ b/python/llaisys/libllaisys/ops.py @@ -1,5 +1,5 @@ from .tensor import llaisysTensor_t -from ctypes import c_float +from ctypes import c_float, c_size_t, c_uint64 def load_ops(lib): lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] @@ -8,6 +8,16 @@ def load_ops(lib): lib.llaisysArgmax.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysArgmax.restype = None + lib.llaisysRandomSample.argtypes = [ + llaisysTensor_t, + llaisysTensor_t, + c_float, + c_size_t, + c_float, + c_uint64, + ] + lib.llaisysRandomSample.restype = None + lib.llaisysEmbedding.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysEmbedding.restype = None diff --git a/python/llaisys/libllaisys/qwen2.py b/python/llaisys/libllaisys/qwen2.py index 0bd15295c..cd803ea18 100644 --- a/python/llaisys/libllaisys/qwen2.py +++ b/python/llaisys/libllaisys/qwen2.py @@ -7,6 +7,7 @@ c_int64, c_void_p, c_char_p, + c_uint64, ) from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t @@ -54,6 +55,21 @@ def load_qwen2(lib): else: print('[libllaisys.qwen2] Warning: llaisysQwen2ModelInfer not found in shared lib') + # int64_t llaisysQwen2ModelInferSample(struct LlaisysQwen2Model * model, + # int64_t * token_ids, size_t ntoken, + # float 
temperature, size_t top_k, float top_p, uint64_t seed); + if hasattr(lib, 'llaisysQwen2ModelInferSample'): + lib.llaisysQwen2ModelInferSample.argtypes = [ + c_void_p, + POINTER(c_int64), + c_size_t, + c_float, + c_size_t, + c_float, + c_uint64, + ] + lib.llaisysQwen2ModelInferSample.restype = c_int64 + # int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor); if hasattr(lib, 'llaisysQwen2ModelSetWeight'): lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p] diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index aee1fb4b6..6d840ce1b 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -3,6 +3,7 @@ from .. import Tensor from ..libllaisys.qwen2 import LlaisysQwen2Meta from pathlib import Path +import random import safetensors @@ -136,10 +137,11 @@ def generate( top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, + seed: int = None, ): if self._backend_model is not None: import ctypes - from ctypes import c_int64, c_size_t + from ctypes import c_float, c_int64, c_size_t, c_uint64 input_ids = [int(t) for t in inputs] if not input_ids: @@ -161,8 +163,36 @@ def generate( self._backend_kv = None self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) + sample_api_available = hasattr(LIB_LLAISYS, "llaisysQwen2ModelInferSample") + sample_top_k = max(0, int(top_k)) + sample_top_p = float(top_p) + sample_temperature = float(temperature) + rng = random.Random() if seed is None else random.Random(int(seed)) + + def infer_with_sampling(token_buffer, n_token): + if sample_api_available: + step_seed = rng.getrandbits(64) + return int( + LIB_LLAISYS.llaisysQwen2ModelInferSample( + self._backend_model, + token_buffer, + c_size_t(n_token), + c_float(sample_temperature), + c_size_t(sample_top_k), + c_float(sample_top_p), + c_uint64(step_seed), + ) + ) + return int( + LIB_LLAISYS.llaisysQwen2ModelInfer( + 
self._backend_model, + token_buffer, + c_size_t(n_token), + ) + ) + arr = (c_int64 * len(input_ids))(*input_ids) - next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(input_ids)))) + next_token = infer_with_sampling(arr, len(input_ids)) output_ids = list(input_ids) for _ in range(max_new_tokens): @@ -172,7 +202,7 @@ def generate( if self._end_token >= 0 and next_token == self._end_token: break arr = (c_int64 * 1)(next_token) - next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(1))) + next_token = infer_with_sampling(arr, 1) return output_ids diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc8..d44cba18f 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -1,6 +1,6 @@ from .libllaisys import LIB_LLAISYS from .tensor import Tensor -from ctypes import c_float, c_int +from ctypes import c_float, c_size_t, c_uint64 class Ops: @@ -12,6 +12,24 @@ def add(c: Tensor, a: Tensor, b: Tensor): def argmax(max_idx: Tensor, max_val: Tensor, vals: Tensor): LIB_LLAISYS.llaisysArgmax(max_idx.lib_tensor(), max_val.lib_tensor(), vals.lib_tensor()) + @staticmethod + def random_sample( + out_idx: Tensor, + logits: Tensor, + temperature: float = 1.0, + top_k: int = 0, + top_p: float = 1.0, + seed: int = 0, + ): + LIB_LLAISYS.llaisysRandomSample( + out_idx.lib_tensor(), + logits.lib_tensor(), + c_float(temperature), + c_size_t(max(0, int(top_k))), + c_float(top_p), + c_uint64(int(seed) & 0xFFFFFFFFFFFFFFFF), + ) + @staticmethod def embedding(out: Tensor, index: Tensor, weight: Tensor): LIB_LLAISYS.llaisysEmbedding( diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index ca8de527d..025ec2322 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -4,6 +4,7 @@ #include "../ops/add/op.hpp" #include "../ops/argmax/op.hpp" +#include "../ops/random_sample/op.hpp" #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" #include "../ops/rearrange/op.hpp" @@ -19,6 +20,21 
@@ __C { void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals) { llaisys::ops::argmax(max_idx->tensor, max_val->tensor, vals->tensor); } + void llaisysRandomSample( + llaisysTensor_t out_idx, + llaisysTensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + llaisys::ops::random_sample( + out_idx->tensor, + logits->tensor, + temperature, + top_k, + top_p, + seed); + } void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight) { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index 4166fe18f..1427d24f3 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -390,7 +390,27 @@ __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, co return it != model->weight_map.end() ? 1 : 0; } -static int64_t infer_one_token(struct LlaisysQwen2Model *model, int64_t token_id) { +static int64_t read_scalar_i64(llaisysTensor_t tensor, const LlaisysRuntimeAPI *runtime_api, int64_t fallback) { + if (!tensor) { + return fallback; + } + if (tensor->tensor->deviceType() == LLAISYS_DEVICE_CPU) { + int64_t *ptr = reinterpret_cast(tensor->tensor->data()); + return ptr ? 
ptr[0] : fallback; + } + int64_t v = fallback; + runtime_api->memcpy_sync(&v, tensor->tensor->data(), sizeof(v), LLAISYS_MEMCPY_D2H); + return v; +} + +static int64_t infer_one_token( + struct LlaisysQwen2Model *model, + int64_t token_id, + bool do_sample, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { using namespace llaisys; if (!model->weights.in_embed || !model->weights.out_embed) { @@ -557,24 +577,25 @@ static int64_t infer_one_token(struct LlaisysQwen2Model *model, int64_t token_id auto logits = make_tensor({1, model->meta.voc}, dtype, model->device_type, model->device_id); llaisysLinear(logits, logits_in, model->weights.out_embed, nullptr); - auto max_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); - auto max_val = make_tensor(one_shape, dtype, model->device_type, model->device_id); - llaisysArgmax(max_idx, max_val, logits); - int64_t next = model->meta.end_token; - if (max_idx->tensor->deviceType() == LLAISYS_DEVICE_CPU) { - int64_t *res_ptr = reinterpret_cast(max_idx->tensor->data()); - next = res_ptr ? 
res_ptr[0] : model->meta.end_token; + if (do_sample) { + auto sample_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + llaisysRandomSample(sample_idx, logits, temperature, top_k, top_p, seed); + next = read_scalar_i64(sample_idx, runtime_api, model->meta.end_token); + delete sample_idx; } else { - runtime_api->memcpy_sync(&next, max_idx->tensor->data(), sizeof(int64_t), LLAISYS_MEMCPY_D2H); + auto max_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + auto max_val = make_tensor(one_shape, dtype, model->device_type, model->device_id); + llaisysArgmax(max_idx, max_val, logits); + next = read_scalar_i64(max_idx, runtime_api, model->meta.end_token); + delete max_idx; + delete max_val; } delete idx; delete x; if (out_norm) delete out_norm; delete logits; - delete max_idx; - delete max_val; return next; } @@ -585,7 +606,27 @@ __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_ int64_t next = model->meta.end_token; for (size_t i = 0; i < ntoken; ++i) { - next = infer_one_token(model, token_ids[i]); + next = infer_one_token(model, token_ids[i], false, 1.0f, 1, 1.0f, 0); + } + return next; +} + +__export int64_t llaisysQwen2ModelInferSample( + struct LlaisysQwen2Model * model, + int64_t * token_ids, + size_t ntoken, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + if (!model) return -1; + if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; + + int64_t next = model->meta.end_token; + for (size_t i = 0; i < ntoken; ++i) { + const bool do_sample = (i + 1 == ntoken); + const uint64_t step_seed = seed + static_cast(i); + next = infer_one_token(model, token_ids[i], do_sample, temperature, top_k, top_p, step_seed); } return next; } diff --git a/src/ops/nvidia_cuda.cuh b/src/ops/nvidia_cuda.cuh index 4c41bb544..70749b723 100644 --- a/src/ops/nvidia_cuda.cuh +++ b/src/ops/nvidia_cuda.cuh @@ -41,7 +41,8 @@ __device__ inline float 
to_float(llaisys::fp16_t v) { template <> __device__ inline float to_float(llaisys::bf16_t v) { - return __bfloat162float(__ushort_as_bfloat16(v._v)); + const uint32_t bits = static_cast(v._v) << 16; + return __uint_as_float(bits); } template @@ -62,7 +63,9 @@ __device__ inline llaisys::fp16_t from_float(float v) { template <> __device__ inline llaisys::bf16_t from_float(float v) { llaisys::bf16_t out; - out._v = __bfloat16_as_ushort(__float2bfloat16(v)); + const uint32_t bits = __float_as_uint(v); + const uint32_t rounding_bias = 0x00007FFFu + ((bits >> 16) & 1u); + out._v = static_cast((bits + rounding_bias) >> 16); return out; } diff --git a/src/ops/random_sample/cpu/random_sample_cpu.cpp b/src/ops/random_sample/cpu/random_sample_cpu.cpp new file mode 100644 index 000000000..03d382c69 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.cpp @@ -0,0 +1,186 @@ +#include "random_sample_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::cpu { + +template +void read_logits_to_float( + std::vector &dst, + const std::byte *src, + size_t numel, + float inv_temperature) { + const T *typed_src = reinterpret_cast(src); + dst.resize(numel); + for (size_t i = 0; i < numel; ++i) { + float v = llaisys::utils::cast(typed_src[i]); + if (!std::isfinite(v)) { + v = v > 0.0f ? 
1e30f : -1e30f; + } + dst[i] = v * inv_temperature; + } +} + +static int64_t argmax_index(const std::vector &logits) { + int64_t best_idx = 0; + float best_val = logits[0]; + for (size_t i = 1; i < logits.size(); ++i) { + if (logits[i] > best_val) { + best_val = logits[i]; + best_idx = static_cast(i); + } + } + return best_idx; +} + +static int64_t sample_index_from_logits( + const std::vector &scaled_logits, + size_t top_k, + float top_p, + uint64_t seed) { + const size_t n = scaled_logits.size(); + if (n == 0) { + return 0; + } + + if (top_k == 0 || top_k > n) { + top_k = n; + } + if (!std::isfinite(top_p)) { + top_p = 1.0f; + } + if (top_p < 0.0f) { + top_p = 0.0f; + } + if (top_p > 1.0f) { + top_p = 1.0f; + } + + std::vector candidates(n); + std::iota(candidates.begin(), candidates.end(), size_t(0)); + + auto by_logit_desc = [&scaled_logits](size_t lhs, size_t rhs) { + const float lv = scaled_logits[lhs]; + const float rv = scaled_logits[rhs]; + if (lv == rv) { + return lhs < rhs; + } + return lv > rv; + }; + + if (top_k < n) { + std::nth_element( + candidates.begin(), + candidates.begin() + static_cast(top_k), + candidates.end(), + by_logit_desc); + candidates.resize(top_k); + } + + std::sort(candidates.begin(), candidates.end(), by_logit_desc); + + if (top_p <= 0.0f || candidates.size() == 1) { + return static_cast(candidates[0]); + } + + if (top_p < 1.0f) { + float max_logit = scaled_logits[candidates[0]]; + std::vector probs(candidates.size(), 0.0); + double prob_sum = 0.0; + for (size_t i = 0; i < candidates.size(); ++i) { + const double w = std::exp(static_cast(scaled_logits[candidates[i]] - max_logit)); + probs[i] = w; + prob_sum += w; + } + + if (!(prob_sum > 0.0) || !std::isfinite(prob_sum)) { + return static_cast(candidates[0]); + } + + double cumulative = 0.0; + size_t keep = 0; + for (size_t i = 0; i < candidates.size(); ++i) { + cumulative += probs[i] / prob_sum; + keep = i + 1; + if (cumulative >= static_cast(top_p)) { + break; + } + } + if (keep 
== 0) { + keep = 1; + } + candidates.resize(keep); + } + + float max_logit = -std::numeric_limits::infinity(); + for (size_t idx : candidates) { + if (scaled_logits[idx] > max_logit) { + max_logit = scaled_logits[idx]; + } + } + + std::vector weights(candidates.size(), 0.0); + double weight_sum = 0.0; + for (size_t i = 0; i < candidates.size(); ++i) { + const double w = std::exp(static_cast(scaled_logits[candidates[i]] - max_logit)); + weights[i] = w; + weight_sum += w; + } + + if (!(weight_sum > 0.0) || !std::isfinite(weight_sum)) { + return static_cast(candidates[0]); + } + + std::mt19937_64 rng(seed); + std::discrete_distribution dist(weights.begin(), weights.end()); + return static_cast(candidates[dist(rng)]); +} + +void random_sample( + std::byte *out_idx, + const std::byte *logits, + llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + ASSERT(out_idx != nullptr, "RandomSample: out_idx cannot be null."); + ASSERT(logits != nullptr, "RandomSample: logits cannot be null."); + ASSERT(numel > 0, "RandomSample: numel must be positive."); + + std::vector scaled_logits; + bool deterministic = !std::isfinite(temperature) || temperature <= 0.0f || top_k == 1 || top_p <= 0.0f; + const float inv_temperature = deterministic ? 
1.0f : (1.0f / temperature); + + switch (type) { + case LLAISYS_DTYPE_F32: + read_logits_to_float(scaled_logits, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_F16: + read_logits_to_float(scaled_logits, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_BF16: + read_logits_to_float(scaled_logits, logits, numel, inv_temperature); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + int64_t sampled = 0; + if (deterministic) { + sampled = argmax_index(scaled_logits); + } else { + sampled = sample_index_from_logits(scaled_logits, top_k, top_p, seed); + } + *reinterpret_cast(out_idx) = sampled; +} + +} // namespace llaisys::ops::cpu diff --git a/src/ops/random_sample/cpu/random_sample_cpu.hpp b/src/ops/random_sample/cpu/random_sample_cpu.hpp new file mode 100644 index 000000000..1ac127c93 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void random_sample( + std::byte *out_idx, + const std::byte *logits, + llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); +} diff --git a/src/ops/random_sample/nvidia/random_sample_nvidia.cu b/src/ops/random_sample/nvidia/random_sample_nvidia.cu new file mode 100644 index 000000000..68e5de7b4 --- /dev/null +++ b/src/ops/random_sample/nvidia/random_sample_nvidia.cu @@ -0,0 +1,236 @@ +#include "random_sample_nvidia.hpp" + +#include "../../argmax/nvidia/argmax_nvidia.hpp" +#include "../../nvidia_cuda.cuh" +#include "../cpu/random_sample_cpu.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void cast_scale_kernel(float *out, const T *in, size_t n, float inv_temperature) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= n) { + return; + } + float v = to_float(in[idx]); + if (!isfinite(v)) { + v = v 
> 0.0f ? 1e30f : -1e30f; + } + out[idx] = v * inv_temperature; +} + +template +void launch_cast_scale(float *out, const std::byte *in, size_t n, float inv_temperature) { + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(n, threads); + auto stream = current_stream(); + cast_scale_kernel<<>>( + out, + reinterpret_cast(in), + n, + inv_temperature); + check_cuda(cudaGetLastError(), "random_sample cast_scale kernel launch"); +} + +__global__ void mask_one_kernel(float *vals, int64_t idx, size_t n) { + if (threadIdx.x == 0 && blockIdx.x == 0 && idx >= 0 && static_cast(idx) < n) { + vals[idx] = -1e30f; + } +} + +static int64_t sample_from_sorted_host( + const std::vector &sorted_logits, + const std::vector &sorted_indices, + float top_p, + uint64_t seed) { + if (sorted_logits.empty() || sorted_indices.empty()) { + return 0; + } + + if (!std::isfinite(top_p)) { + top_p = 1.0f; + } + if (top_p < 0.0f) { + top_p = 0.0f; + } + if (top_p > 1.0f) { + top_p = 1.0f; + } + + if (top_p <= 0.0f || sorted_logits.size() == 1) { + return sorted_indices[0]; + } + + const float max_logit = sorted_logits[0]; + std::vector weights(sorted_logits.size(), 0.0); + double weight_sum = 0.0; + for (size_t i = 0; i < sorted_logits.size(); ++i) { + const double w = std::exp(static_cast(sorted_logits[i] - max_logit)); + weights[i] = w; + weight_sum += w; + } + + if (!(weight_sum > 0.0) || !std::isfinite(weight_sum)) { + return sorted_indices[0]; + } + + size_t keep = sorted_logits.size(); + if (top_p < 1.0f) { + double cumulative = 0.0; + keep = 0; + for (size_t i = 0; i < weights.size(); ++i) { + cumulative += weights[i] / weight_sum; + keep = i + 1; + if (cumulative >= static_cast(top_p)) { + break; + } + } + if (keep == 0) { + keep = 1; + } + } + + std::mt19937_64 rng(seed); + std::discrete_distribution dist(weights.begin(), weights.begin() + static_cast(keep)); + return sorted_indices[dist(rng)]; +} + +void random_sample( + std::byte *out_idx, + const std::byte *logits, 
+ llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + ASSERT(out_idx != nullptr, "RandomSample(nvidia): out_idx cannot be null."); + ASSERT(logits != nullptr, "RandomSample(nvidia): logits cannot be null."); + ASSERT(numel > 0, "RandomSample(nvidia): numel must be positive."); + + const bool deterministic = !std::isfinite(temperature) || temperature <= 0.0f || top_k == 1 || top_p <= 0.0f; + const float inv_temperature = deterministic ? 1.0f : (1.0f / temperature); + + float *d_scaled = nullptr; + int64_t *d_max_idx = nullptr; + float *d_max_val = nullptr; + + check_cuda(cudaMalloc(&d_scaled, numel * sizeof(float)), "random_sample cudaMalloc scaled"); + check_cuda(cudaMalloc(&d_max_idx, sizeof(int64_t)), "random_sample cudaMalloc max_idx"); + check_cuda(cudaMalloc(&d_max_val, sizeof(float)), "random_sample cudaMalloc max_val"); + + switch (type) { + case LLAISYS_DTYPE_F32: + launch_cast_scale(d_scaled, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_F16: + launch_cast_scale(d_scaled, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_BF16: + launch_cast_scale(d_scaled, logits, numel, inv_temperature); + break; + default: + cudaFree(d_scaled); + cudaFree(d_max_idx); + cudaFree(d_max_val); + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + auto stream = current_stream(); + int64_t sampled_idx = 0; + + if (deterministic) { + llaisys::ops::nvidia::argmax( + reinterpret_cast(d_max_idx), + reinterpret_cast(d_max_val), + reinterpret_cast(d_scaled), + LLAISYS_DTYPE_F32, + numel); + check_cuda( + cudaMemcpyAsync(&sampled_idx, d_max_idx, sizeof(sampled_idx), cudaMemcpyDeviceToHost, stream), + "random_sample copy deterministic idx"); + check_cuda(cudaStreamSynchronize(stream), "random_sample sync deterministic idx"); + check_cuda( + cudaMemcpy(out_idx, &sampled_idx, sizeof(sampled_idx), cudaMemcpyHostToDevice), + "random_sample deterministic copy idx"); + cudaFree(d_scaled); + 
cudaFree(d_max_idx); + cudaFree(d_max_val); + return; + } + + size_t k = top_k; + if (k == 0 || k > numel) { + k = numel; + } + + // Avoid O(k * vocab) kernel loops for very large k by falling back to single D2H copy. + constexpr size_t kIterativeTopKLimit = 1024; + if (k > kIterativeTopKLimit) { + std::vector host_scaled(numel); + check_cuda( + cudaMemcpyAsync(host_scaled.data(), d_scaled, numel * sizeof(float), cudaMemcpyDeviceToHost, stream), + "random_sample fallback copy scaled"); + check_cuda(cudaStreamSynchronize(stream), "random_sample fallback sync copy scaled"); + cpu::random_sample( + reinterpret_cast(&sampled_idx), + reinterpret_cast(host_scaled.data()), + LLAISYS_DTYPE_F32, + numel, + 1.0f, + top_k, + top_p, + seed); + } else { + std::vector top_logits; + std::vector top_indices; + top_logits.reserve(k); + top_indices.reserve(k); + + for (size_t i = 0; i < k; ++i) { + float h_max_val = -1e30f; + int64_t h_max_idx = 0; + + llaisys::ops::nvidia::argmax( + reinterpret_cast(d_max_idx), + reinterpret_cast(d_max_val), + reinterpret_cast(d_scaled), + LLAISYS_DTYPE_F32, + numel); + + check_cuda( + cudaMemcpyAsync(&h_max_idx, d_max_idx, sizeof(h_max_idx), cudaMemcpyDeviceToHost, stream), + "random_sample copy topk idx"); + check_cuda( + cudaMemcpyAsync(&h_max_val, d_max_val, sizeof(h_max_val), cudaMemcpyDeviceToHost, stream), + "random_sample copy topk val"); + check_cuda(cudaStreamSynchronize(stream), "random_sample sync topk pair"); + + top_indices.push_back(h_max_idx); + top_logits.push_back(h_max_val); + + mask_one_kernel<<<1, 1, 0, stream>>>(d_scaled, h_max_idx, numel); + check_cuda(cudaGetLastError(), "random_sample mask_one kernel launch"); + } + + sampled_idx = sample_from_sorted_host(top_logits, top_indices, top_p, seed); + } + + check_cuda( + cudaMemcpy(out_idx, &sampled_idx, sizeof(sampled_idx), cudaMemcpyHostToDevice), + "random_sample sampled copy idx"); + + cudaFree(d_scaled); + cudaFree(d_max_idx); + cudaFree(d_max_val); +} + +} // namespace 
llaisys::ops::nvidia diff --git a/src/ops/random_sample/nvidia/random_sample_nvidia.hpp b/src/ops/random_sample/nvidia/random_sample_nvidia.hpp new file mode 100644 index 000000000..8f14e99a2 --- /dev/null +++ b/src/ops/random_sample/nvidia/random_sample_nvidia.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void random_sample( + std::byte *out_idx, + const std::byte *logits, + llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); +} diff --git a/src/ops/random_sample/op.cpp b/src/ops/random_sample/op.cpp new file mode 100644 index 000000000..37d3ee839 --- /dev/null +++ b/src/ops/random_sample/op.cpp @@ -0,0 +1,83 @@ +#include "op.hpp" + +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/random_sample_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/random_sample_nvidia.hpp" +#endif + +#include + +namespace llaisys::ops { + +void random_sample( + tensor_t out_idx, + tensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + CHECK_SAME_DEVICE(out_idx, logits); + ASSERT(out_idx->dtype() == LLAISYS_DTYPE_I64, "RandomSample: out_idx must be Int64."); + ASSERT(out_idx->numel() == 1, "RandomSample: out_idx must have one element."); + ASSERT(logits->isContiguous(), "RandomSample: logits must be contiguous."); + ASSERT(logits->numel() > 0, "RandomSample: logits must be non-empty."); + + if (logits->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::random_sample( + out_idx->data(), + logits->data(), + logits->dtype(), + logits->numel(), + temperature, + top_k, + top_p, + seed); + } + + llaisys::core::context().setDevice(logits->deviceType(), logits->deviceId()); + +#ifdef ENABLE_NVIDIA_API + if (logits->deviceType() == LLAISYS_DEVICE_NVIDIA) { + return nvidia::random_sample( + out_idx->data(), + logits->data(), + logits->dtype(), + logits->numel(), + temperature, + top_k, + top_p, + 
seed); + } +#endif + + const LlaisysRuntimeAPI *runtime_api = llaisys::core::context().runtime().api(); + + const size_t logits_bytes = logits->numel() * logits->elementSize(); + std::vector host_logits(logits_bytes); + runtime_api->memcpy_sync( + host_logits.data(), + logits->data(), + logits_bytes, + LLAISYS_MEMCPY_D2H); + + int64_t sampled_idx = 0; + cpu::random_sample( + reinterpret_cast(&sampled_idx), + host_logits.data(), + logits->dtype(), + logits->numel(), + temperature, + top_k, + top_p, + seed); + + runtime_api->memcpy_sync( + out_idx->data(), + &sampled_idx, + sizeof(sampled_idx), + LLAISYS_MEMCPY_H2D); +} + +} // namespace llaisys::ops diff --git a/src/ops/random_sample/op.hpp b/src/ops/random_sample/op.hpp new file mode 100644 index 000000000..900713224 --- /dev/null +++ b/src/ops/random_sample/op.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../../tensor/tensor.hpp" + +#include +#include + +namespace llaisys::ops { +void random_sample( + tensor_t out_idx, + tensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); +} diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index bbd2e13c3..b0871adb8 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -2,8 +2,21 @@ target("llaisys-device-nvidia") set_kind("static") set_languages("cxx17") set_warnings("all", "error") - add_includedirs("/usr/local/cuda/include") - add_linkdirs("/usr/local/cuda/lib64") + local cuda_sdk = get_config("cuda") + if cuda_sdk then + local cuda_include = path.join(cuda_sdk, "include") + local cuda_lib64 = path.join(cuda_sdk, "lib64") + local cuda_lib = path.join(cuda_sdk, "lib") + if os.isdir(cuda_include) then + add_includedirs(cuda_include) + end + if os.isdir(cuda_lib64) then + add_linkdirs(cuda_lib64) + end + if os.isdir(cuda_lib) then + add_linkdirs(cuda_lib) + end + end add_links("cudart") if not is_plat("windows") then add_cxflags("-fPIC", "-Wno-unknown-pragmas") @@ -23,11 +36,24 @@ target("llaisys-ops-nvidia") set_languages("cxx17") 
set_warnings("all", "error") - add_includedirs("/usr/local/cuda/include") - add_linkdirs("/usr/local/cuda/lib64") + local cuda_sdk = get_config("cuda") + if cuda_sdk then + local cuda_include = path.join(cuda_sdk, "include") + local cuda_lib64 = path.join(cuda_sdk, "lib64") + local cuda_lib = path.join(cuda_sdk, "lib") + if os.isdir(cuda_include) then + add_includedirs(cuda_include) + end + if os.isdir(cuda_lib64) then + add_linkdirs(cuda_lib64) + end + if os.isdir(cuda_lib) then + add_linkdirs(cuda_lib) + end + end add_links("cudart", "cublas") if not is_plat("windows") then - add_cuflags("-Xcompiler=-fPIC") + add_cuflags("-Xcompiler=-fPIC", {force = true}) end add_files("../src/ops/*/nvidia/*.cu") From a4c40d2571da9eaf80e7fa0930824dcdcc773cc5 Mon Sep 17 00:00:00 2001 From: mengyijia49 Date: Mon, 16 Mar 2026 16:28:30 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=A4=9A=E4=BD=99?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/check_exports.py | 7 --- scripts/smoke_generate.py | 106 -------------------------------------- scripts/smoke_qwen2.py | 54 ------------------- scripts/test_ctypes.py | 15 ------ 4 files changed, 182 deletions(-) delete mode 100644 scripts/check_exports.py delete mode 100644 scripts/smoke_generate.py delete mode 100644 scripts/smoke_qwen2.py delete mode 100644 scripts/test_ctypes.py diff --git a/scripts/check_exports.py b/scripts/check_exports.py deleted file mode 100644 index 9e5b03549..000000000 --- a/scripts/check_exports.py +++ /dev/null @@ -1,7 +0,0 @@ -import ctypes -lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') -print('handle=', lib._handle) -GetProcAddress=ctypes.windll.kernel32.GetProcAddress -print('create=', GetProcAddress(lib._handle, b'llaisysQwen2ModelCreate')) -print('tensor=', GetProcAddress(lib._handle, b'tensorCreate')) -print('done') diff --git a/scripts/smoke_generate.py 
b/scripts/smoke_generate.py deleted file mode 100644 index 6de800534..000000000 --- a/scripts/smoke_generate.py +++ /dev/null @@ -1,106 +0,0 @@ -import ctypes -from ctypes import c_size_t, c_int, c_int64, c_void_p, c_char_p, POINTER -import numpy as np -import sys - -DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' -print('Loading DLL', DLL) -lib = ctypes.CDLL(DLL) - -# prototypes -lib.tensorCreate.argtypes = [POINTER(c_size_t), c_size_t, c_int, c_int, c_int] -lib.tensorCreate.restype = c_void_p -lib.tensorLoad.argtypes = [c_void_p, c_void_p] -lib.tensorLoad.restype = None -lib.tensorDestroy.argtypes = [c_void_p] -lib.tensorDestroy.restype = None - -lib.llaisysQwen2ModelCreate.argtypes = [POINTER(ctypes.c_void_p), c_int, POINTER(c_int), c_int] -# We'll not use this prototype; define a simpler one matching earlier header -# But to be safe use c_void_p for meta pointer by building a struct in Python not trivial; instead use direct Create with bytes - -# Define meta struct layout in Python using ctypes -class Meta(ctypes.Structure): - _fields_ = [ - ('dtype', c_int), - ('nlayer', c_size_t), - ('hs', c_size_t), - ('nh', c_size_t), - ('nkvh', c_size_t), - ('dh', c_size_t), - ('di', c_size_t), - ('maxseq', c_size_t), - ('voc', c_size_t), - ('epsilon', ctypes.c_float), - ('theta', ctypes.c_float), - ('end_token', c_int64), - ] - -lib.llaisysQwen2ModelCreate.argtypes = [POINTER(Meta), c_int, POINTER(c_int), c_int] -lib.llaisysQwen2ModelCreate.restype = c_void_p -lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p] -lib.llaisysQwen2ModelSetWeight.restype = c_int -lib.llaisysQwen2ModelFinalize.argtypes = [c_void_p] -lib.llaisysQwen2ModelFinalize.restype = c_int -lib.llaisysQwen2ModelInfer.argtypes = [c_void_p, POINTER(c_int64), c_size_t] -lib.llaisysQwen2ModelInfer.restype = c_int64 - -# Create small meta -meta = Meta() -meta.dtype = 13 # F32 -meta.nlayer = 0 # no layers to avoid accessing per-layer weights -meta.hs = 16 -meta.nh = 4 
-meta.nkvh = 4 -meta.dh = 4 -meta.di = 64 -meta.maxseq = 128 -meta.voc = 100 -meta.epsilon = 1e-5 -meta.theta = 1.0 -meta.end_token = -1 - -print('Creating model') -model = lib.llaisysQwen2ModelCreate(ctypes.byref(meta), 0, None, 0) -if not model: - print('Model create failed', file=sys.stderr) - sys.exit(2) -print('Model ptr', model) - -# create in_embed tensor shape [voc, hs] -voc = int(meta.voc) -hs = int(meta.hs) -shape = (c_size_t * 2)(voc, hs) -emb_tensor = lib.tensorCreate(shape, 2, 13, 0, 0) -# fill with random floats -arr = (np.random.rand(voc, hs).astype(np.float32)).ctypes -lib.tensorLoad(emb_tensor, arr.data) -print('in_embed created') - -# create out_embed tensor shape [voc, hs] -shape2 = (c_size_t * 2)(voc, hs) -out_tensor = lib.tensorCreate(shape2, 2, 13, 0, 0) -arr2 = (np.random.rand(hs, voc).astype(np.float32)).ctypes -lib.tensorLoad(out_tensor, arr2.data) -print('out_embed created') - -# set weights -ret = lib.llaisysQwen2ModelSetWeight(model, b'embed_tokens.weight', emb_tensor) -print('set in_embed', ret) -ret = lib.llaisysQwen2ModelSetWeight(model, b'lm_head.weight', out_tensor) -print('set out_embed', ret) - -# finalize -lib.llaisysQwen2ModelFinalize(model) -print('finalized') - -# infer on token ids [1,2,3] -seq = (c_int64 * 3)(1, 2, 3) -nexttok = lib.llaisysQwen2ModelInfer(model, seq, 3) -print('next token ->', nexttok) - -# cleanup -lib.tensorDestroy(emb_tensor) -lib.tensorDestroy(out_tensor) -lib.llaisysQwen2ModelDestroy(ctypes.c_void_p(model)) -print('done') diff --git a/scripts/smoke_qwen2.py b/scripts/smoke_qwen2.py deleted file mode 100644 index aacc7b285..000000000 --- a/scripts/smoke_qwen2.py +++ /dev/null @@ -1,54 +0,0 @@ -import ctypes -from ctypes import Structure, POINTER, c_size_t, c_int, c_int64, c_float, c_void_p -import sys - -DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' -print('Loading DLL:', DLL) -lib = ctypes.CDLL(DLL) - -class LlaisysQwen2Meta(Structure): - _fields_ = [ - ('dtype', c_int), - 
('nlayer', c_size_t), - ('hs', c_size_t), - ('nh', c_size_t), - ('nkvh', c_size_t), - ('dh', c_size_t), - ('di', c_size_t), - ('maxseq', c_size_t), - ('voc', c_size_t), - ('epsilon', c_float), - ('theta', c_float), - ('end_token', c_int64), - ] - -# prototypes -lib.llaisysQwen2ModelCreate.argtypes = [POINTER(LlaisysQwen2Meta), c_int, POINTER(c_int), c_int] -lib.llaisysQwen2ModelCreate.restype = c_void_p -lib.llaisysQwen2ModelDestroy.argtypes = [c_void_p] -lib.llaisysQwen2ModelDestroy.restype = None - -m = LlaisysQwen2Meta() -m.dtype = 13 # LLAISYS_DTYPE_F32 -m.nlayer = 1 -m.hs = 16 -m.nh = 4 -m.nkvh = 4 -m.dh = 4 -m.di = 64 -m.maxseq = 128 -m.voc = 1000 -m.epsilon = 1e-5 -m.theta = 1.0 -m.end_token = -1 - -print('Calling ll_create...') -model = lib.llaisysQwen2ModelCreate(ctypes.byref(m), 0, None, 0) -print('ll_create returned:', model) -if not model: - print('create failed', file=sys.stderr) - sys.exit(2) - -print('Calling ll_destroy...') -lib.llaisysQwen2ModelDestroy(model) -print('destroy ok') diff --git a/scripts/test_ctypes.py b/scripts/test_ctypes.py deleted file mode 100644 index b236c22bd..000000000 --- a/scripts/test_ctypes.py +++ /dev/null @@ -1,15 +0,0 @@ -import ctypes -lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') -print('has create attr:', hasattr(lib, 'llaisysQwen2ModelCreate')) -try: - f = lib.llaisysQwen2ModelCreate - print('got create:', f) -except Exception as e: - print('error getting create:', e) - -print('has tensorCreate:', hasattr(lib, 'tensorCreate')) -try: - f2 = lib.tensorCreate - print('got tensorCreate:', f2) -except Exception as e: - print('error getting tensorCreate:', e) From 8ddebee5af27312e6df0925314c2a974c5829aed Mon Sep 17 00:00:00 2001 From: mengyijia49 Date: Mon, 16 Mar 2026 16:45:01 +0800 Subject: [PATCH 9/9] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=9B=9E=E9=80=80?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
python/llaisys/models/qwen2.py | 148 ++++++++++++++------------------- 1 file changed, 62 insertions(+), 86 deletions(-) diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 6d840ce1b..bd9aa4546 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -12,12 +12,6 @@ except Exception: torch = None -try: - from transformers import AutoModelForCausalLM - HF_AVAILABLE = True -except Exception: - HF_AVAILABLE = False - class Qwen2: def __init__(self, model_path, device: DeviceType = DeviceType.CPU): @@ -119,16 +113,7 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): if missing: print("[llaisys qwen2] Warning: missing weights:", missing) except Exception as e: - # backend unavailable or error during loading; fall back to HF - print(f"[llaisys qwen2] backend load failed: {e}") - self._backend_model = None - - if self._backend_model is None: - if not HF_AVAILABLE: - raise RuntimeError("Neither backend nor HuggingFace available for Qwen2 model") - self.device = torch.device("cpu" if device == DeviceType.CPU else ("cuda" if torch.cuda.is_available() else "cpu")) - self.model = AutoModelForCausalLM.from_pretrained(str(model_path), trust_remote_code=True, torch_dtype=torch.bfloat16) - self.model.to(self.device) + raise RuntimeError(f"[llaisys qwen2] backend load failed: {e}") from e def generate( self, @@ -139,83 +124,74 @@ def generate( temperature: float = 0.8, seed: int = None, ): - if self._backend_model is not None: - import ctypes - from ctypes import c_float, c_int64, c_size_t, c_uint64 - - input_ids = [int(t) for t in inputs] - if not input_ids: - return [] - - if max_new_tokens is None: - max_new_tokens = 128 - max_new_tokens = int(max_new_tokens) - if max_new_tokens <= 0: - return input_ids - - kv_cap = len(input_ids) + max_new_tokens - if self._maxseq > 0: - kv_cap = min(kv_cap, self._maxseq) - kv_cap = max(kv_cap, 1) - - if self._backend_kv is not None: - 
LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) - self._backend_kv = None - self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) - - sample_api_available = hasattr(LIB_LLAISYS, "llaisysQwen2ModelInferSample") - sample_top_k = max(0, int(top_k)) - sample_top_p = float(top_p) - sample_temperature = float(temperature) - rng = random.Random() if seed is None else random.Random(int(seed)) - - def infer_with_sampling(token_buffer, n_token): - if sample_api_available: - step_seed = rng.getrandbits(64) - return int( - LIB_LLAISYS.llaisysQwen2ModelInferSample( - self._backend_model, - token_buffer, - c_size_t(n_token), - c_float(sample_temperature), - c_size_t(sample_top_k), - c_float(sample_top_p), - c_uint64(step_seed), - ) - ) + if self._backend_model is None: + raise RuntimeError("Qwen2 backend is not initialized") + + import ctypes + from ctypes import c_float, c_int64, c_size_t, c_uint64 + + input_ids = [int(t) for t in inputs] + if not input_ids: + return [] + + if max_new_tokens is None: + max_new_tokens = 128 + max_new_tokens = int(max_new_tokens) + if max_new_tokens <= 0: + return input_ids + + kv_cap = len(input_ids) + max_new_tokens + if self._maxseq > 0: + kv_cap = min(kv_cap, self._maxseq) + kv_cap = max(kv_cap, 1) + + if self._backend_kv is not None: + LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) + self._backend_kv = None + self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) + + sample_api_available = hasattr(LIB_LLAISYS, "llaisysQwen2ModelInferSample") + sample_top_k = max(0, int(top_k)) + sample_top_p = float(top_p) + sample_temperature = float(temperature) + rng = random.Random() if seed is None else random.Random(int(seed)) + + def infer_with_sampling(token_buffer, n_token): + if sample_api_available: + step_seed = rng.getrandbits(64) return int( - LIB_LLAISYS.llaisysQwen2ModelInfer( + LIB_LLAISYS.llaisysQwen2ModelInferSample( self._backend_model, token_buffer, 
c_size_t(n_token), + c_float(sample_temperature), + c_size_t(sample_top_k), + c_float(sample_top_p), + c_uint64(step_seed), ) ) - - arr = (c_int64 * len(input_ids))(*input_ids) - next_token = infer_with_sampling(arr, len(input_ids)) - - output_ids = list(input_ids) - for _ in range(max_new_tokens): - if next_token is None: - break - output_ids.append(next_token) - if self._end_token >= 0 and next_token == self._end_token: - break - arr = (c_int64 * 1)(next_token) - next_token = infer_with_sampling(arr, 1) - - return output_ids - - input_ids = torch.tensor([list(inputs)], dtype=torch.long, device=self.device) - with torch.no_grad(): - outputs = self.model.generate( - input_ids, - max_new_tokens=max_new_tokens, - top_k=top_k, - top_p=top_p, - temperature=temperature, + return int( + LIB_LLAISYS.llaisysQwen2ModelInfer( + self._backend_model, + token_buffer, + c_size_t(n_token), + ) ) - return outputs[0].tolist() + + arr = (c_int64 * len(input_ids))(*input_ids) + next_token = infer_with_sampling(arr, len(input_ids)) + + output_ids = list(input_ids) + for _ in range(max_new_tokens): + if next_token is None: + break + output_ids.append(next_token) + if self._end_token >= 0 and next_token == self._end_token: + break + arr = (c_int64 * 1)(next_token) + next_token = infer_with_sampling(arr, 1) + + return output_ids def __del__(self): try: