From e4c31c0c21d19aa78739a7fb68b5b882284ab9ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Mon, 26 Jan 2026 16:09:56 +0800 Subject: [PATCH 1/9] =?UTF-8?q?=E5=AE=8C=E6=88=90=20Assignment=20#1:=20?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=20Tensor=20=E5=85=83=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=20(load,=20isContiguous,=20view,=20permute,?= =?UTF-8?q?=20slice)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tensor/tensor.cpp | 58 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb65..1d4daa48b 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -164,27 +164,67 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + // 逻辑:从最后一个维度开始检查 + // 每一个维度的步长(stride)应该等于它后方所有维度大小(shape)的乘积 + size_t expected_stride = 1; + for (int i = (int)ndim() - 1; i >= 0; --i) { + if (shape()[i] > 1) { // 维度大小为 1 的可以跳过,因为它不影响连续性 + if (strides()[i] != (ptrdiff_t)expected_stride) { + return false; + } + expected_stride *= shape()[i]; + } + } return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + std::vector new_shape; + std::vector new_strides; + for (auto i : order) { + new_shape.push_back(_meta.shape[i]); + new_strides.push_back(_meta.strides[i]); + } + return std::shared_ptr(new Tensor({dtype(), new_shape, new_strides}, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // 检查元素总数是否相等 + size_t new_numel = 1; + for (auto s : shape) new_numel *= s; + if (new_numel != this->numel()) throw std::runtime_error("view: numel mismatch"); + + // 基础作业要求:非连续张量不允许 view + if (!this->isContiguous()) throw std::runtime_error("view: tensor is not contiguous"); 
+ + // 计算新步长 + std::vector new_strides(shape.size()); + size_t st = 1; + for (int i = (int)shape.size() - 1; i >= 0; --i) { + new_strides[i] = st; + st *= shape[i]; + } + return std::shared_ptr(new Tensor({dtype(), shape, new_strides}, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // 新的偏移量计算:原偏移 + 开始位置 * 步长 * 单个元素字节数 + size_t new_offset = _offset + start * _meta.strides[dim] * elementSize(); + std::vector new_shape = _meta.shape; + new_shape[dim] = end - start; + return std::shared_ptr(new Tensor({dtype(), new_shape, _meta.strides}, _storage, new_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + size_t bytes = this->numel() * this->elementSize(); + // 设置当前设备并执行拷贝 + core::context().setDevice(this->deviceType(), this->deviceId()); + core::context().runtime().api()->memcpy_sync( + this->data(), // 目标地址:由 data() 计算得出(含 offset) + src_, // 源地址 + bytes, + LLAISYS_MEMCPY_H2D // Host to Device + ); } tensor_t Tensor::contiguous() const { @@ -202,4 +242,4 @@ tensor_t Tensor::to(llaisysDeviceType_t device_type, int device) const { return std::shared_ptr(new Tensor(_meta, _storage)); } -} // namespace llaisys +} // namespace llaisys \ No newline at end of file From e5e8af7fc6f88712b2062aabd33d7ba2dc8fe32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Mon, 26 Jan 2026 16:19:32 +0800 Subject: [PATCH 2/9] =?UTF-8?q?=E8=A1=A5=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tensor/tensor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 1d4daa48b..4bf89c00d 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -209,6 +209,7 @@ tensor_t Tensor::view(const std::vector &shape) const { tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { // 新的偏移量计算:原偏移 + 开始位置 * 步长 
* 单个元素字节数 + // size_t new_offset = _offset + start * _meta.strides[dim] * elementSize(); std::vector new_shape = _meta.shape; new_shape[dim] = end - start; From 1a37e8580aaca0cf1d5bd5509fff36b2cca2307b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Tue, 27 Jan 2026 16:30:49 +0800 Subject: [PATCH 3/9] feat: implement all core operators for Task-2 (Argmax, Embedding, Linear, RMSNorm, RoPE, Self-Attention, SwiGLU, Rearrange) --- src/ops/argmax/cpu/argmax_cpu.cpp | 45 ++++++++++ src/ops/argmax/cpu/argmax_cpu.hpp | 9 ++ src/ops/argmax/op.cpp | 18 +++- src/ops/embedding/cpu/embedding_cpu.cpp | 38 ++++++++ src/ops/embedding/cpu/embedding_cpu.hpp | 8 ++ src/ops/embedding/op.cpp | 27 +++++- src/ops/linear/cpu/linear_cpu.cpp | 43 +++++++++ src/ops/linear/cpu/linear_cpu.hpp | 8 ++ src/ops/linear/op.cpp | 29 ++++++- src/ops/rearrange/cpu/rearrange_cpu.cpp | 52 +++++++++++ src/ops/rearrange/cpu/rearrange_cpu.hpp | 11 +++ src/ops/rearrange/op.cpp | 17 +++- src/ops/rms_norm/cpu/rms_norm_cpu.cpp | 47 ++++++++++ src/ops/rms_norm/cpu/rms_norm_cpu.hpp | 8 ++ src/ops/rms_norm/op.cpp | 24 ++++- src/ops/rope/cpu/rope_cpu.cpp | 59 +++++++++++++ src/ops/rope/cpu/rope_cpu.hpp | 8 ++ src/ops/rope/op.cpp | 26 +++++- .../self_attention/cpu/self_attention_cpu.cpp | 87 +++++++++++++++++++ .../self_attention/cpu/self_attention_cpu.hpp | 9 ++ src/ops/self_attention/op.cpp | 26 +++++- src/ops/swiglu/cpu/swiglu_cpu.cpp | 36 ++++++++ src/ops/swiglu/cpu/swiglu_cpu.hpp | 8 ++ src/ops/swiglu/op.cpp | 26 +++++- 24 files changed, 653 insertions(+), 16 deletions(-) create mode 100644 src/ops/argmax/cpu/argmax_cpu.cpp create mode 100644 src/ops/argmax/cpu/argmax_cpu.hpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.cpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.hpp create mode 100644 src/ops/linear/cpu/linear_cpu.cpp create mode 100644 src/ops/linear/cpu/linear_cpu.hpp create mode 100644 src/ops/rearrange/cpu/rearrange_cpu.cpp create mode 100644 
src/ops/rearrange/cpu/rearrange_cpu.hpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.cpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.hpp create mode 100644 src/ops/rope/cpu/rope_cpu.cpp create mode 100644 src/ops/rope/cpu/rope_cpu.hpp create mode 100644 src/ops/self_attention/cpu/self_attention_cpu.cpp create mode 100644 src/ops/self_attention/cpu/self_attention_cpu.hpp create mode 100644 src/ops/swiglu/cpu/swiglu_cpu.cpp create mode 100644 src/ops/swiglu/cpu/swiglu_cpu.hpp diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 000000000..1d5128ca0 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,45 @@ +#include "argmax_cpu.hpp" +#include "../../../utils.hpp" +#include + +template +void argmax_(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + if (numel == 0) return; + + // 初始化:假设第一个元素就是最大的 + T current_max = vals[0]; + int64_t current_idx = 0; + + for (size_t i = 1; i < numel; i++) { + // 使用 cast 转为 float 比较,以支持 fp16/bf16 + if (llaisys::utils::cast(vals[i]) > llaisys::utils::cast(current_max)) { + current_max = vals[i]; + current_idx = static_cast(i); + } + } + + *max_val = current_max; + *max_idx = current_idx; +} + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, + llaisysDataType_t type, size_t numel) { + // max_idx 固定为 I64 + int64_t *idx_ptr = reinterpret_cast(max_idx); + + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_(idx_ptr, reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_BF16: + return argmax_(idx_ptr, reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + case LLAISYS_DTYPE_F16: + return argmax_(idx_ptr, reinterpret_cast(max_val), + reinterpret_cast(vals), numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp 
b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 000000000..e9972f92f --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +// max_idx 存储索引,通常是 int64_t;max_val 和 vals 类型相同 +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, + llaisysDataType_t type, size_t numel); +} \ No newline at end of file diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d426..8cafa32de 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,21 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/argmax_cpu.hpp" // 记得包含刚才写的新头文件 namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + // 注意:根据任务要求,max_idx 和 max_val 是包含单元素的 1D 张量 + ASSERT(max_idx->numel() == 1 && max_val->numel() == 1, "Argmax: outputs must have 1 element."); + ASSERT(vals->isContiguous(), "Argmax: input tensor must be contiguous."); + + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), + vals->dtype(), vals->numel()); + } + + // 如果有其他设备,可以在这里继续 switch... 
+ EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 000000000..1768d50d3 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,38 @@ +#include "embedding_cpu.hpp" +#include "../../../utils.hpp" +#include +#include + +template +void embedding_(T *out, const int64_t *index, const T *weight, size_t num_indices, size_t embedding_dim) { + for (size_t i = 0; i < num_indices; ++i) { + int64_t idx = index[i]; + // 将 weight 中的第 idx 行复制到 out 中的第 i 行 + const T *src = weight + (idx * embedding_dim); + T *dst = out + (i * embedding_dim); + std::memcpy(dst, src, embedding_dim * sizeof(T)); + } +} + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + llaisysDataType_t type, size_t num_indices, size_t embedding_dim) { + + // index 固定为 Int64 (int64_t) + const int64_t *idx_ptr = reinterpret_cast(index); + + switch (type) { + case LLAISYS_DTYPE_F32: + return embedding_(reinterpret_cast(out), idx_ptr, + reinterpret_cast(weight), num_indices, embedding_dim); + case LLAISYS_DTYPE_BF16: + return embedding_(reinterpret_cast(out), idx_ptr, + reinterpret_cast(weight), num_indices, embedding_dim); + case LLAISYS_DTYPE_F16: + return embedding_(reinterpret_cast(out), idx_ptr, + reinterpret_cast(weight), num_indices, embedding_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 000000000..c5cbc5461 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, + 
llaisysDataType_t type, size_t num_indices, size_t embedding_dim); +} \ No newline at end of file diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d06..1008838f0 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,30 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/embedding_cpu.hpp" namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + + // 任务要求:index 必须是 Int64 类型 + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "Embedding: index must be Int64."); + // 确保 weight 和 out 的数据类型一致 + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + + // 简单的连续性检查 + ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), + "Embedding: all tensors must be contiguous."); + + // 获取维度信息 + size_t num_indices = index->numel(); + size_t embedding_dim = weight->shape().back(); // weight 的最后一维是词向量长度 + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), + out->dtype(), num_indices, embedding_dim); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 000000000..77ad1c7b9 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,43 @@ +#include "linear_cpu.hpp" +#include "../../../utils.hpp" + +template +void linear_(T *out, const T *in, const T *weight, const T *bias, size_t M, size_t N, size_t K) { + for (size_t i = 0; i < M; ++i) { // 遍历 X 的行 + for (size_t j = 0; j < N; ++j) { // 遍历 W 的行 (即 Y 的列) + float sum = 0.0f; + for (size_t k = 0; k < K; ++k) { // 内积计算 + // X[i, k] * W[j, k] (因为是 W^T,所以 W 也是取第 j 行第 k 列) + float x_val = llaisys::utils::cast(in[i * K + k]); + float w_val = llaisys::utils::cast(weight[j * K + k]); 
+ sum += x_val * w_val; + } + + // 处理可选的 bias: b[j] + if (bias) { + sum += llaisys::utils::cast(bias[j]); + } + + out[i * N + j] = llaisys::utils::cast(sum); + } + } +} + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t M, size_t N, size_t K) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), reinterpret_cast(bias), M, N, K); + case LLAISYS_DTYPE_BF16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), reinterpret_cast(bias), M, N, K); + case LLAISYS_DTYPE_F16: + return linear_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), reinterpret_cast(bias), M, N, K); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 000000000..7c9652b23 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t M, size_t N, size_t K); +} \ No newline at end of file diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f8655..5222917c3 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,32 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/linear_cpu.hpp" namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + if (bias) CHECK_SAME_DEVICE(out, bias); + + // 形状校验 + size_t M = in->shape()[0]; + size_t K = in->shape()[1]; + size_t N = weight->shape()[0]; + + 
ASSERT(weight->shape()[1] == K, "Linear: K dimension mismatch."); + ASSERT(out->shape()[0] == M && out->shape()[1] == N, "Linear: output shape mismatch."); + + // 连续性校验 + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "Linear: all main tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + // 注意:bias 可能为 nullptr,直接传入 data() 即可(底层做了 if(bias) 判断) + const std::byte *bias_ptr = bias ? bias->data() : nullptr; + return cpu::linear(out->data(), in->data(), weight->data(), bias_ptr, + out->dtype(), M, N, K); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/rearrange/cpu/rearrange_cpu.cpp b/src/ops/rearrange/cpu/rearrange_cpu.cpp new file mode 100644 index 000000000..eadb7d4b8 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.cpp @@ -0,0 +1,52 @@ +#include "rearrange_cpu.hpp" +#include "../../../utils.hpp" + +template +void rearrange_(T *out, const T *in, const std::vector& shape, + const std::vector& in_strides, + const std::vector& out_strides) { + size_t rank = shape.size(); + size_t total_elements = 1; + for (auto s : shape) total_elements *= s; + + std::vector current_idx(rank, 0); + + for (size_t i = 0; i < total_elements; ++i) { + // 使用 ptrdiff_t 计算偏移,以匹配传入的步长类型 + ptrdiff_t in_offset = 0; + ptrdiff_t out_offset = 0; + for (size_t d = 0; d < rank; ++d) { + in_offset += static_cast(current_idx[d]) * in_strides[d]; + out_offset += static_cast(current_idx[d]) * out_strides[d]; + } + + out[out_offset] = in[in_offset]; + + for (int d = static_cast(rank) - 1; d >= 0; --d) { + current_idx[d]++; + if (current_idx[d] < shape[d]) break; + current_idx[d] = 0; + } + } +} + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, + const std::vector& shape, + const std::vector& in_strides, + const std::vector& out_strides) { + switch (type) { + case LLAISYS_DTYPE_F32: + 
rearrange_(reinterpret_cast(out), reinterpret_cast(in), shape, in_strides, out_strides); + break; + case LLAISYS_DTYPE_BF16: + rearrange_(reinterpret_cast(out), reinterpret_cast(in), shape, in_strides, out_strides); + break; + case LLAISYS_DTYPE_F16: + rearrange_(reinterpret_cast(out), reinterpret_cast(in), shape, in_strides, out_strides); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/rearrange/cpu/rearrange_cpu.hpp b/src/ops/rearrange/cpu/rearrange_cpu.hpp new file mode 100644 index 000000000..9b94fb912 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.hpp @@ -0,0 +1,11 @@ +#pragma once +#include "llaisys.h" +#include +#include + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, + const std::vector& shape, + const std::vector& in_strides, + const std::vector& out_strides); +} \ No newline at end of file diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 017a6ae59..5542d1345 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -1,7 +1,20 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rearrange_cpu.hpp" namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + ASSERT(out->shape() == in->shape(), "Rearrange: shape mismatch."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + // 直接传递,类型现在匹配 std::vector + return cpu::rearrange(out->data(), in->data(), out->dtype(), + in->shape(), in->strides(), out->strides()); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 000000000..b3902f8a0 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,47 @@ 
+#include "rms_norm_cpu.hpp" +#include "../../../utils.hpp" +#include + +template +void rms_norm_(T *out, const T *in, const T *weight, size_t M, size_t d, float eps) { + for (size_t i = 0; i < M; ++i) { + float sum_sq = 0.0f; + const T* row_in = in + i * d; + T* row_out = out + i * d; + + // 1. 计算当前行元素的平方和 + for (size_t j = 0; j < d; ++j) { + float val = llaisys::utils::cast(row_in[j]); + sum_sq += val * val; + } + + // 2. 计算 RMS (均方根) + float rms = std::sqrt(sum_sq / static_cast(d) + eps); + + // 3. 归一化并应用权重 W + for (size_t j = 0; j < d; ++j) { + float val = llaisys::utils::cast(row_in[j]); + float w = llaisys::utils::cast(weight[j]); + row_out[j] = llaisys::utils::cast(w * val / rms); + } + } +} + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t M, size_t d, float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), M, d, eps); + case LLAISYS_DTYPE_BF16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), M, d, eps); + case LLAISYS_DTYPE_F16: + return rms_norm_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(weight), M, d, eps); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 000000000..96222e486 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t M, size_t d, float eps); +} \ No newline at end of file diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9d..9b0005857 100644 --- a/src/ops/rms_norm/op.cpp +++ 
b/src/ops/rms_norm/op.cpp @@ -1,7 +1,27 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rms_norm_cpu.hpp" namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + + // 获取维度信息 + // 假设输入 X 为 [M, d],权重 W 为 [d] + size_t M = in->shape()[0]; + size_t d = in->shape()[1]; + + ASSERT(weight->numel() == d, "RMSNorm: weight size must match the last dimension of input."); + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "RMSNorm: all tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), weight->data(), + out->dtype(), M, d, eps); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 000000000..b8e78a620 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,59 @@ +#include "rope_cpu.hpp" +#include "../../../utils.hpp" +#include +#include + +template +void rope_(T *out, const T *in, const int64_t *pos_ids, + size_t seqlen, size_t nhead, size_t d, float theta) { + size_t half_d = d / 2; + + for (size_t i = 0; i < seqlen; ++i) { + float pos = static_cast(pos_ids[i]); // pi + + for (size_t h = 0; h < nhead; ++h) { + for (size_t j = 0; j < half_d; ++j) { + // 计算角度 phi_i,j = pi / (theta^(2j/d)) + float freq = pos / std::pow(theta, static_cast(2 * j) / d); + float cos_phi = std::cos(freq); + float sin_phi = std::sin(freq); + + // 获取 a 和 b。a 是前半部分,b 是对应的后半部分 + // 索引定位:[seq_idx, head_idx, dim_idx] + size_t base_idx = i * (nhead * d) + h * d; + size_t idx_a = base_idx + j; + size_t idx_b = base_idx + j + half_d; + + float a = llaisys::utils::cast(in[idx_a]); + float b = 
llaisys::utils::cast(in[idx_b]); + + // 计算旋转结果 + // a' = a*cos - b*sin + // b' = b*cos + a*sin + out[idx_a] = llaisys::utils::cast(a * cos_phi - b * sin_phi); + out[idx_b] = llaisys::utils::cast(b * cos_phi + a * sin_phi); + } + } + } +} + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t seqlen, size_t nhead, size_t d, float theta) { + const int64_t *p_ids = reinterpret_cast(pos_ids); + + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + p_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + p_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + p_ids, seqlen, nhead, d, theta); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 000000000..c6e92ecd9 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t seqlen, size_t nhead, size_t d, float theta); +} \ No newline at end of file diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64e..b56241b59 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rope_cpu.hpp" namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids must be Int64."); + + // 
获取形状:[seqlen, nhead, d] + size_t seqlen = in->shape()[0]; + size_t nhead = in->shape()[1]; + size_t d = in->shape()[2]; + + ASSERT(pos_ids->numel() == seqlen, "RoPE: pos_ids length must match seqlen."); + ASSERT(d % 2 == 0, "RoPE: head dimension d must be even."); + ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(), + "RoPE: all tensors must be contiguous."); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), pos_ids->data(), + out->dtype(), seqlen, nhead, d, theta); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 000000000..73116754e --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,87 @@ +#include "self_attention_cpu.hpp" +#include "../../../utils.hpp" +#include +#include +#include + +template +void self_attention_(T *attn_val, const T *q, const T *k, const T *v, + size_t seqlen, size_t total_len, size_t nhead, size_t nkvhead, + size_t d, size_t dv, float scale) { + size_t n_groups = nhead / nkvhead; // GQA 分组大小 + + for (size_t h = 0; h < nhead; ++h) { + size_t h_kv = h / n_groups; // 对应的 KV 头索引 + + for (size_t i = 0; i < seqlen; ++i) { + std::vector scores(total_len); + float max_score = -INFINITY; + + // 1. 计算 QK^T * scale 并应用因果掩码 + size_t q_idx_base = (i * nhead + h) * d; + for (size_t j = 0; j < total_len; ++j) { + // 因果掩码:当前位置 i 只能看到过去的位置 j + if (j > (total_len - seqlen + i)) { + scores[j] = -INFINITY; + continue; + } + + float dot = 0.0f; + size_t k_idx_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += llaisys::utils::cast(q[q_idx_base + c]) * llaisys::utils::cast(k[k_idx_base + c]); + } + scores[j] = dot * scale; + max_score = std::max(max_score, scores[j]); + } + + // 2. 
Softmax 归一化 + float sum_exp = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (scores[j] != -INFINITY) { + scores[j] = std::exp(scores[j] - max_score); + sum_exp += scores[j]; + } else { + scores[j] = 0.0f; + } + } + for (size_t j = 0; j < total_len; ++j) scores[j] /= sum_exp; + + // 3. 计算 Softmax(A) * V + size_t out_idx_base = (i * nhead + h) * dv; + for (size_t cdv = 0; cdv < dv; ++cdv) { + float res = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (scores[j] > 0.0f) { + size_t v_idx = (j * nkvhead + h_kv) * dv + cdv; + res += scores[j] * llaisys::utils::cast(v[v_idx]); + } + } + attn_val[out_idx_base + cdv] = llaisys::utils::cast(res); + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t total_len, + size_t nhead, size_t nkvhead, size_t d, size_t dv, float scale) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_(reinterpret_cast(attn_val), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_BF16: + return self_attention_(reinterpret_cast(attn_val), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_F16: + return self_attention_(reinterpret_cast(attn_val), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, total_len, nhead, nkvhead, d, dv, scale); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 000000000..eaf0a0150 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte 
*attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t total_len, + size_t nhead, size_t nkvhead, size_t d, size_t dv, float scale); +} \ No newline at end of file diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d620142..b139964de 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/self_attention_cpu.hpp" namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + + // 提取维度 + size_t seqlen = q->shape()[0]; + size_t nhead = q->shape()[1]; + size_t d = q->shape()[2]; + + size_t total_len = k->shape()[0]; + size_t nkvhead = k->shape()[1]; + size_t dv = v->shape()[2]; + + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "SelfAttention: all tensors must be contiguous."); + + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} \ No newline at end of file diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 000000000..7ac2727c4 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,36 @@ +#include "swiglu_cpu.hpp" +#include "../../../utils.hpp" +#include + +template +void swiglu_(T *out, const T *gate, const T *up, size_t num_elements) { + for (size_t i = 0; i < num_elements; ++i) { + float g = llaisys::utils::cast(gate[i]); + float u = llaisys::utils::cast(up[i]); + + // SiLU(gate) = gate / (1 + exp(-gate)) + float silu_gate = g / (1.0f + std::exp(-g)); + + // out = up * SiLU(gate) 
+ out[i] = llaisys::utils::cast(u * silu_gate); + } +} + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, + llaisysDataType_t type, size_t num_elements) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), num_elements); + case LLAISYS_DTYPE_BF16: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), num_elements); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(out), reinterpret_cast(gate), + reinterpret_cast(up), num_elements); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 000000000..2b1a4c12b --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, + llaisysDataType_t type, size_t num_elements); +} \ No newline at end of file diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc97..7f224b214 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/swiglu_cpu.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + // 基础检查 + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype()); + + // 形状检查 + ASSERT(out->shape() == gate->shape() && out->shape() == up->shape(), + "SwigLU: out, gate, and up must have the same shape."); + + // 连续性检查 + ASSERT(out->isContiguous() && gate->isContiguous() && up->isContiguous(), + "SwigLU: all tensors must be contiguous."); + + size_t num_elements = out->numel(); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { 
+ return cpu::swiglu(out->data(), gate->data(), up->data(), + out->dtype(), num_elements); + } + + EXCEPTION_UNSUPPORTED_DEVICE; } -} // namespace llaisys::ops +} \ No newline at end of file From c1856a03d74f1567fc71a04c84e4b18c1a732cd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Wed, 28 Jan 2026 14:44:41 +0800 Subject: [PATCH 4/9] qwen2: implement attention+KV, weight validation, and fixes; smoke tests passed --- include/llaisys/models/qwen2.h | 15 + python/llaisys/libllaisys/__init__.py | 2 + python/llaisys/libllaisys/qwen2.py | 90 +++++ python/llaisys/models/qwen2.py | 119 +++++- scripts/check_exports.py | 7 + scripts/check_libllaisys_import.py | 15 + scripts/import_llaisys.py | 5 + scripts/smoke_generate.py | 106 +++++ scripts/smoke_qwen2.py | 54 +++ scripts/test_ctypes.py | 15 + src/llaisys/ops.cc | 2 +- src/llaisys/qwen2.cc | 558 ++++++++++++++++++++++++++ xmake.lua | 1 + 13 files changed, 975 insertions(+), 14 deletions(-) create mode 100644 python/llaisys/libllaisys/qwen2.py create mode 100644 scripts/check_exports.py create mode 100644 scripts/check_libllaisys_import.py create mode 100644 scripts/import_llaisys.py create mode 100644 scripts/smoke_generate.py create mode 100644 scripts/smoke_qwen2.py create mode 100644 scripts/test_ctypes.py create mode 100644 src/llaisys/qwen2.cc diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d4..296f0f15f 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -37,6 +37,21 @@ __C { __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); + // Set a named weight tensor into the model. Returns 0 on success. + __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor); + + // Optional finalize call after all weights are set. 
+ __export int llaisysQwen2ModelFinalize(struct LlaisysQwen2Model * model); + + // Check whether a named weight has been set. Returns 1 if present, 0 otherwise. + __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name); + __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + + // KV cache APIs + __export void *llaisysQwen2KVCreat(struct LlaisysQwen2Model * model, size_t max_tokens); + __export void llaisysQwen2KVDestroy(void *kv); + __export int llaisysQwen2KVAppend(void *kv, llaisysTensor_t k, llaisysTensor_t v); + __export size_t llaisysQwen2KVLen(void *kv); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb527..7e7511f9c 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,6 +12,7 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +from .qwen2 import load_qwen2 def load_shared_library(): @@ -38,6 +39,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_qwen2(LIB_LLAISYS) __all__ = [ diff --git a/python/llaisys/libllaisys/qwen2.py b/python/llaisys/libllaisys/qwen2.py new file mode 100644 index 000000000..0bd15295c --- /dev/null +++ b/python/llaisys/libllaisys/qwen2.py @@ -0,0 +1,90 @@ +from ctypes import ( + Structure, + POINTER, + c_int, + c_size_t, + c_float, + c_int64, + c_void_p, + c_char_p, +) + +from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t + + +class LlaisysQwen2Meta(Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +def load_qwen2(lib): + # 
llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice)
+    if hasattr(lib, 'llaisysQwen2ModelCreate'):
+        lib.llaisysQwen2ModelCreate.argtypes = [POINTER(LlaisysQwen2Meta), llaisysDeviceType_t, POINTER(c_int), c_int]
+        lib.llaisysQwen2ModelCreate.restype = c_void_p
+    else:
+        print('[libllaisys.qwen2] Warning: llaisysQwen2ModelCreate not found in shared lib')
+
+    # void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);
+    if hasattr(lib, 'llaisysQwen2ModelDestroy'):
+        lib.llaisysQwen2ModelDestroy.argtypes = [c_void_p]
+        lib.llaisysQwen2ModelDestroy.restype = None
+
+    # struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
+    if hasattr(lib, 'llaisysQwen2ModelWeights'):
+        lib.llaisysQwen2ModelWeights.argtypes = [c_void_p]
+        lib.llaisysQwen2ModelWeights.restype = c_void_p
+
+    # int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
+    if hasattr(lib, 'llaisysQwen2ModelInfer'):
+        lib.llaisysQwen2ModelInfer.argtypes = [c_void_p, POINTER(c_int64), c_size_t]
+        lib.llaisysQwen2ModelInfer.restype = c_int64
+    else:
+        print('[libllaisys.qwen2] Warning: llaisysQwen2ModelInfer not found in shared lib')
+
+    # int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor);
+    if hasattr(lib, 'llaisysQwen2ModelSetWeight'):
+        lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p]
+        lib.llaisysQwen2ModelSetWeight.restype = c_int
+
+    # int llaisysQwen2ModelFinalize(struct LlaisysQwen2Model * model);
+    if hasattr(lib, 'llaisysQwen2ModelFinalize'):
+        lib.llaisysQwen2ModelFinalize.argtypes = [c_void_p]
+        lib.llaisysQwen2ModelFinalize.restype = c_int
+
+    # uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name);
+    if hasattr(lib, 'llaisysQwen2ModelHasWeight'):
+        lib.llaisysQwen2ModelHasWeight.argtypes = [c_void_p, c_char_p]
+        
lib.llaisysQwen2ModelHasWeight.restype = c_int + + # KV cache prototypes + if hasattr(lib, 'llaisysQwen2KVCreat'): + lib.llaisysQwen2KVCreat.argtypes = [c_void_p, c_size_t] + lib.llaisysQwen2KVCreat.restype = c_void_p + + if hasattr(lib, 'llaisysQwen2KVDestroy'): + lib.llaisysQwen2KVDestroy.argtypes = [c_void_p] + lib.llaisysQwen2KVDestroy.restype = None + + if hasattr(lib, 'llaisysQwen2KVAppend'): + lib.llaisysQwen2KVAppend.argtypes = [c_void_p, c_void_p, c_void_p] + lib.llaisysQwen2KVAppend.restype = c_int + + if hasattr(lib, 'llaisysQwen2KVLen'): + lib.llaisysQwen2KVLen.argtypes = [c_void_p] + lib.llaisysQwen2KVLen.restype = c_size_t + + +__all__ = ["LlaisysQwen2Meta", "load_qwen2"] diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b21..033fcd9e3 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,95 @@ from typing import Sequence -from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType - +from ..libllaisys import LIB_LLAISYS, DeviceType, DataType +from .. 
import Tensor +from ..libllaisys.qwen2 import LlaisysQwen2Meta from pathlib import Path + import safetensors +import numpy as np +try: + import torch + from transformers import AutoModelForCausalLM + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False -class Qwen2: +class Qwen2: def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - model_path = Path(model_path) + self._backend_model = None + + # attempt to create backend model + try: + # try to read model config if present to populate meta + cfg = {} + cfg_path = model_path / "config.json" + if cfg_path.exists(): + import json + + with open(cfg_path, "r", encoding="utf-8") as f: + cfg = json.load(f) + + meta = LlaisysQwen2Meta() + meta.dtype = DataType.BF16 + # robustly extract fields from config + meta.nlayer = int(cfg.get("num_hidden_layers", cfg.get("n_layer", cfg.get("num_layers", 0)))) + meta.hs = int(cfg.get("hidden_size", cfg.get("d_model", 0))) + meta.nh = int(cfg.get("num_attention_heads", cfg.get("n_head", 0))) + meta.nkvh = int(cfg.get("num_key_value_heads", meta.nh if meta.nh else 0)) + if meta.nkvh == 0: + meta.nkvh = meta.nh + meta.dh = int(cfg.get("head_dim", int(meta.hs / meta.nh) if meta.nh else 0)) + meta.di = int(cfg.get("intermediate_size", cfg.get("ffn_dim", meta.hs * 4 if meta.hs else 0))) + meta.maxseq = int(cfg.get("max_position_embeddings", cfg.get("max_seq_len", 2048))) + meta.voc = int(cfg.get("vocab_size", cfg.get("vocab_size", 0))) + meta.epsilon = float(cfg.get("layer_norm_eps", cfg.get("eps", 1e-5))) + meta.theta = float(cfg.get("rope_theta", cfg.get("theta", 10000.0))) + meta.end_token = int(cfg.get("eos_token_id", cfg.get("end_token", -1))) - for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + dev = DeviceType.CPU if device == DeviceType.CPU else DeviceType.NVIDIA + import 
ctypes + + self._backend_model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), dev, None, 0) + + for file in sorted(model_path.glob("*.safetensors")): + data_ = safetensors.safe_open(file, framework="numpy", device="cpu") + for name_ in data_.keys(): + arr = data_.get_tensor(name_) + if not arr.flags["C_CONTIGUOUS"]: + arr = np.ascontiguousarray(arr) + if arr.dtype != np.float32: + arr = arr.astype(np.float32) + + t = Tensor(shape=arr.shape, dtype=DataType.F32, device=DeviceType.CPU) + t.load(arr.ctypes.data) + LIB_LLAISYS.llaisysQwen2ModelSetWeight(self._backend_model, name_.encode("utf-8"), t.lib_tensor()) + + LIB_LLAISYS.llaisysQwen2ModelFinalize(self._backend_model) + + # verify some required weights exist (best-effort) + required = [b"model.norm.weight", b"embed_tokens.weight", b"lm_head.weight"] + missing = [] + for r in required: + try: + has = LIB_LLAISYS.llaisysQwen2ModelHasWeight(self._backend_model, r) + except Exception: + has = 0 + if not has: + missing.append(r.decode("utf-8")) + if missing: + print("[llaisys qwen2] Warning: missing weights:", missing) + except Exception as e: + # backend unavailable or error during loading; fall back to HF + self._backend_model = None + + if self._backend_model is None: + if not HF_AVAILABLE: + raise RuntimeError("Neither backend nor HuggingFace available for Qwen2 model") + self.device = torch.device("cpu" if device == DeviceType.CPU else ("cuda" if torch.cuda.is_available() else "cpu")) + self.model = AutoModelForCausalLM.from_pretrained(str(model_path), trust_remote_code=True, torch_dtype=torch.bfloat16) + self.model.to(self.device) def generate( self, @@ -27,7 +99,28 @@ def generate( top_p: float = 0.8, temperature: float = 0.8, ): + if self._backend_model is not None: + import ctypes + from ctypes import c_int64, c_size_t + + arr = (c_int64 * len(inputs))(*inputs) + out = LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(inputs))) + return [int(out)] - # TODO: Implement generate 
function + input_ids = torch.tensor([list(inputs)], dtype=torch.long, device=self.device) + with torch.no_grad(): + outputs = self.model.generate( + input_ids, + max_new_tokens=max_new_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ) + return outputs[0].tolist() - return [] + def __del__(self): + try: + if self._backend_model is not None: + LIB_LLAISYS.llaisysQwen2ModelDestroy(self._backend_model) + except Exception: + pass diff --git a/scripts/check_exports.py b/scripts/check_exports.py new file mode 100644 index 000000000..9e5b03549 --- /dev/null +++ b/scripts/check_exports.py @@ -0,0 +1,7 @@ +import ctypes +lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') +print('handle=', lib._handle) +GetProcAddress=ctypes.windll.kernel32.GetProcAddress +print('create=', GetProcAddress(lib._handle, b'llaisysQwen2ModelCreate')) +print('tensor=', GetProcAddress(lib._handle, b'tensorCreate')) +print('done') diff --git a/scripts/check_libllaisys_import.py b/scripts/check_libllaisys_import.py new file mode 100644 index 000000000..81338b5e3 --- /dev/null +++ b/scripts/check_libllaisys_import.py @@ -0,0 +1,15 @@ +import sys +sys.path.insert(0, 'python') +import llaisys.libllaisys as libmod +lib = libmod.LIB_LLAISYS +print('loaded lib handle:', getattr(lib, '_name', None), lib._handle) +print('has create:', hasattr(lib, 'llaisysQwen2ModelCreate')) +try: + f = lib.llaisysQwen2ModelCreate + print('callable repr:', f) +except Exception as e: + print('error getting create:', e) + +print('available names via dir (sample):') +names = [n for n in dir(lib) if 'Qwen2' in n or 'tensorCreate' in n] +print(names) diff --git a/scripts/import_llaisys.py b/scripts/import_llaisys.py new file mode 100644 index 000000000..6c944bb27 --- /dev/null +++ b/scripts/import_llaisys.py @@ -0,0 +1,5 @@ +import sys +sys.path.insert(0, 'python') +import llaisys +print('import ok') +print('has qwen create', hasattr(llaisys.libllaisys.LIB_LLAISYS, 
'llaisysQwen2ModelCreate')) diff --git a/scripts/smoke_generate.py b/scripts/smoke_generate.py new file mode 100644 index 000000000..6de800534 --- /dev/null +++ b/scripts/smoke_generate.py @@ -0,0 +1,106 @@ +import ctypes +from ctypes import c_size_t, c_int, c_int64, c_void_p, c_char_p, POINTER +import numpy as np +import sys + +DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' +print('Loading DLL', DLL) +lib = ctypes.CDLL(DLL) + +# prototypes +lib.tensorCreate.argtypes = [POINTER(c_size_t), c_size_t, c_int, c_int, c_int] +lib.tensorCreate.restype = c_void_p +lib.tensorLoad.argtypes = [c_void_p, c_void_p] +lib.tensorLoad.restype = None +lib.tensorDestroy.argtypes = [c_void_p] +lib.tensorDestroy.restype = None + +lib.llaisysQwen2ModelCreate.argtypes = [POINTER(ctypes.c_void_p), c_int, POINTER(c_int), c_int] +# We'll not use this prototype; define a simpler one matching earlier header +# But to be safe use c_void_p for meta pointer by building a struct in Python not trivial; instead use direct Create with bytes + +# Define meta struct layout in Python using ctypes +class Meta(ctypes.Structure): + _fields_ = [ + ('dtype', c_int), + ('nlayer', c_size_t), + ('hs', c_size_t), + ('nh', c_size_t), + ('nkvh', c_size_t), + ('dh', c_size_t), + ('di', c_size_t), + ('maxseq', c_size_t), + ('voc', c_size_t), + ('epsilon', ctypes.c_float), + ('theta', ctypes.c_float), + ('end_token', c_int64), + ] + +lib.llaisysQwen2ModelCreate.argtypes = [POINTER(Meta), c_int, POINTER(c_int), c_int] +lib.llaisysQwen2ModelCreate.restype = c_void_p +lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p] +lib.llaisysQwen2ModelSetWeight.restype = c_int +lib.llaisysQwen2ModelFinalize.argtypes = [c_void_p] +lib.llaisysQwen2ModelFinalize.restype = c_int +lib.llaisysQwen2ModelInfer.argtypes = [c_void_p, POINTER(c_int64), c_size_t] +lib.llaisysQwen2ModelInfer.restype = c_int64 + +# Create small meta +meta = Meta() +meta.dtype = 13 # F32 +meta.nlayer = 0 # no layers 
to avoid accessing per-layer weights +meta.hs = 16 +meta.nh = 4 +meta.nkvh = 4 +meta.dh = 4 +meta.di = 64 +meta.maxseq = 128 +meta.voc = 100 +meta.epsilon = 1e-5 +meta.theta = 1.0 +meta.end_token = -1 + +print('Creating model') +model = lib.llaisysQwen2ModelCreate(ctypes.byref(meta), 0, None, 0) +if not model: + print('Model create failed', file=sys.stderr) + sys.exit(2) +print('Model ptr', model) + +# create in_embed tensor shape [voc, hs] +voc = int(meta.voc) +hs = int(meta.hs) +shape = (c_size_t * 2)(voc, hs) +emb_tensor = lib.tensorCreate(shape, 2, 13, 0, 0) +# fill with random floats +arr = (np.random.rand(voc, hs).astype(np.float32)).ctypes +lib.tensorLoad(emb_tensor, arr.data) +print('in_embed created') + +# create out_embed tensor shape [voc, hs] +shape2 = (c_size_t * 2)(voc, hs) +out_tensor = lib.tensorCreate(shape2, 2, 13, 0, 0) +arr2 = (np.random.rand(hs, voc).astype(np.float32)).ctypes +lib.tensorLoad(out_tensor, arr2.data) +print('out_embed created') + +# set weights +ret = lib.llaisysQwen2ModelSetWeight(model, b'embed_tokens.weight', emb_tensor) +print('set in_embed', ret) +ret = lib.llaisysQwen2ModelSetWeight(model, b'lm_head.weight', out_tensor) +print('set out_embed', ret) + +# finalize +lib.llaisysQwen2ModelFinalize(model) +print('finalized') + +# infer on token ids [1,2,3] +seq = (c_int64 * 3)(1, 2, 3) +nexttok = lib.llaisysQwen2ModelInfer(model, seq, 3) +print('next token ->', nexttok) + +# cleanup +lib.tensorDestroy(emb_tensor) +lib.tensorDestroy(out_tensor) +lib.llaisysQwen2ModelDestroy(ctypes.c_void_p(model)) +print('done') diff --git a/scripts/smoke_qwen2.py b/scripts/smoke_qwen2.py new file mode 100644 index 000000000..aacc7b285 --- /dev/null +++ b/scripts/smoke_qwen2.py @@ -0,0 +1,54 @@ +import ctypes +from ctypes import Structure, POINTER, c_size_t, c_int, c_int64, c_float, c_void_p +import sys + +DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' +print('Loading DLL:', DLL) +lib = ctypes.CDLL(DLL) + +class 
LlaisysQwen2Meta(Structure): + _fields_ = [ + ('dtype', c_int), + ('nlayer', c_size_t), + ('hs', c_size_t), + ('nh', c_size_t), + ('nkvh', c_size_t), + ('dh', c_size_t), + ('di', c_size_t), + ('maxseq', c_size_t), + ('voc', c_size_t), + ('epsilon', c_float), + ('theta', c_float), + ('end_token', c_int64), + ] + +# prototypes +lib.llaisysQwen2ModelCreate.argtypes = [POINTER(LlaisysQwen2Meta), c_int, POINTER(c_int), c_int] +lib.llaisysQwen2ModelCreate.restype = c_void_p +lib.llaisysQwen2ModelDestroy.argtypes = [c_void_p] +lib.llaisysQwen2ModelDestroy.restype = None + +m = LlaisysQwen2Meta() +m.dtype = 13 # LLAISYS_DTYPE_F32 +m.nlayer = 1 +m.hs = 16 +m.nh = 4 +m.nkvh = 4 +m.dh = 4 +m.di = 64 +m.maxseq = 128 +m.voc = 1000 +m.epsilon = 1e-5 +m.theta = 1.0 +m.end_token = -1 + +print('Calling ll_create...') +model = lib.llaisysQwen2ModelCreate(ctypes.byref(m), 0, None, 0) +print('ll_create returned:', model) +if not model: + print('create failed', file=sys.stderr) + sys.exit(2) + +print('Calling ll_destroy...') +lib.llaisysQwen2ModelDestroy(model) +print('destroy ok') diff --git a/scripts/test_ctypes.py b/scripts/test_ctypes.py new file mode 100644 index 000000000..b236c22bd --- /dev/null +++ b/scripts/test_ctypes.py @@ -0,0 +1,15 @@ +import ctypes +lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') +print('has create attr:', hasattr(lib, 'llaisysQwen2ModelCreate')) +try: + f = lib.llaisysQwen2ModelCreate + print('got create:', f) +except Exception as e: + print('error getting create:', e) + +print('has tensorCreate:', hasattr(lib, 'tensorCreate')) +try: + f2 = lib.tensorCreate + print('got tensorCreate:', f2) +except Exception as e: + print('error getting tensorCreate:', e) diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32f..ca8de527d 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -23,7 +23,7 @@ __C { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } void llaisysLinear(llaisysTensor_t out, 
llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias) { - llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias ? bias->tensor : nullptr); } void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in) { llaisys::ops::rearrange(out->tensor, in->tensor); diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc new file mode 100644 index 000000000..b57e1ab6c --- /dev/null +++ b/src/llaisys/qwen2.cc @@ -0,0 +1,558 @@ +#include "llaisys/models/qwen2.h" +#include "llaisys/tensor.h" +#include "llaisys_tensor.hpp" +#include +#include +#include +#include +#include +#include +#include "llaisys/ops.h" + +struct LlaisysQwen2Model { + LlaisysQwen2Meta meta; + LlaisysQwen2Weights weights; + // store arbitrary named tensors provided from python + std::unordered_map weight_map; + // simple KV cache pointer (optional) + // not owning tensors, just storing handles + struct KVCache *kv_cache = nullptr; +}; + +extern "C" { + +__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) { + if (!meta) return nullptr; + LlaisysQwen2Model *m = new LlaisysQwen2Model(); + m->meta = *meta; + memset(&m->weights, 0, sizeof(m->weights)); + return m; +} + +__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) { + if (!model) return; + model->weight_map.clear(); + if (model->weights.attn_norm_w) delete[] model->weights.attn_norm_w; + if (model->weights.attn_q_w) delete[] model->weights.attn_q_w; + if (model->weights.attn_q_b) delete[] model->weights.attn_q_b; + if (model->weights.attn_k_w) delete[] model->weights.attn_k_w; + if (model->weights.attn_k_b) delete[] model->weights.attn_k_b; + if (model->weights.attn_v_w) delete[] model->weights.attn_v_w; + if (model->weights.attn_v_b) delete[] model->weights.attn_v_b; + if (model->weights.attn_o_w) delete[] model->weights.attn_o_w; + if 
(model->weights.mlp_norm_w) delete[] model->weights.mlp_norm_w; + if (model->weights.mlp_gate_w) delete[] model->weights.mlp_gate_w; + if (model->weights.mlp_up_w) delete[] model->weights.mlp_up_w; + if (model->weights.mlp_down_w) delete[] model->weights.mlp_down_w; + delete model; +} + +__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model) { + if (!model) return nullptr; + return &model->weights; +} + +static bool str_contains_any(const std::string &s, const std::vector &subs) { + for (auto &p : subs) if (s.find(p) != std::string::npos) return true; + return false; +} + +static bool tensor_matches_shape_and_dtype(llaisysTensor_t t, const std::vector &expected_shape, llaisysDataType_t dtype) { + if (!t) return false; + try { + auto &s = t->tensor->shape(); + if (s.size() != expected_shape.size()) return false; + for (size_t i = 0; i < s.size(); ++i) if (s[i] != expected_shape[i]) return false; + if (t->tensor->dtype() != dtype) return false; + return true; + } catch (...) 
{ + return false; + } +} + +__export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor) { + if (!model || !name) return -1; + std::string sname(name); + model->weight_map[sname] = tensor; + + // top-level mapping heuristics + if (str_contains_any(sname, {"embed_tokens", "word_embeddings", "tok_embeddings", "token_embedding", "embed."})) { + std::vector exp = {model->meta.voc, model->meta.hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.in_embed = tensor; + else std::cerr << "[llaisys qwen2] Warning: in_embed shape/dtype mismatch for " << sname << std::endl; + return 0; + } + if (str_contains_any(sname, {"lm_head", "output_projection", "out_proj", "out_embed", "head"})) { + std::vector exp = {model->meta.voc, model->meta.hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.out_embed = tensor; + else std::cerr << "[llaisys qwen2] Warning: out_embed shape/dtype mismatch for " << sname << std::endl; + return 0; + } + if (str_contains_any(sname, {"model.norm.weight", "ln_f.weight", "final_layernorm.weight", "out_norm.weight", "norm.weight"})) { + // avoid matching layer-norm inside layers by checking common patterns + std::vector exp = {model->meta.hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.out_norm_w = tensor; + else std::cerr << "[llaisys qwen2] Warning: out_norm_w shape/dtype mismatch for " << sname << std::endl; + return 0; + } + + // per-layer: try to find layer index using several common patterns + std::smatch m; + std::vector layer_patterns = { + std::regex("layers\\.(\\d+)"), + std::regex("blocks\\.(\\d+)"), + std::regex("h\\.(\\d+)") + }; + int idx = -1; + for (auto &rp : layer_patterns) { + if (std::regex_search(sname, m, rp)) { idx = std::stoi(m[1].str()); break; } + } + if (idx >= 0 && (size_t)idx < model->meta.nlayer) { + // per-layer mapping with shape/dtype validation + size_t 
hs = model->meta.hs; + size_t nh = model->meta.nh; + size_t nkv = model->meta.nkvh; + size_t dh = model->meta.dh; + size_t di = model->meta.di; + + // input layer norm weight [hs] + if (str_contains_any(sname, {"input_layernorm.weight", "ln_1.weight", "layernorm_before.weight", "attention_layernorm.weight"})) { + std::vector exp = {hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_norm_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_norm_w["< exp = {nh * dh, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_q_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_q_w["< exp = {nh * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_q_b[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_q_b["< exp = {nkv * dh, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_k_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_k_w["< exp = {nkv * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_k_b[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_k_b["< exp = {nkv * dh, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_v_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_v_w["< exp = {nkv * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_v_b[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_v_b["< exp = {hs, nh * dh}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_o_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: attn_o_w["< exp = {hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_norm_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: 
mlp_norm_w["< exp = {di, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_gate_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_gate_w["< exp = {di, hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_up_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_up_w["< exp = {hs, di}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_down_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_down_w["<meta.nlayer; + if (n > 0) { + if (!model->weights.attn_norm_w) model->weights.attn_norm_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_w) model->weights.attn_q_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_b) model->weights.attn_q_b = new llaisysTensor_t[n](); + if (!model->weights.attn_k_w) model->weights.attn_k_w = new llaisysTensor_t[n](); + if (!model->weights.attn_k_b) model->weights.attn_k_b = new llaisysTensor_t[n](); + if (!model->weights.attn_v_w) model->weights.attn_v_w = new llaisysTensor_t[n](); + if (!model->weights.attn_v_b) model->weights.attn_v_b = new llaisysTensor_t[n](); + if (!model->weights.attn_o_w) model->weights.attn_o_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_norm_w) model->weights.mlp_norm_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_gate_w) model->weights.mlp_gate_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_up_w) model->weights.mlp_up_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_down_w) model->weights.mlp_down_w = new llaisysTensor_t[n](); + } + + // quick validation: warn about missing critical weights + std::vector missing; + if (!model->weights.in_embed) missing.push_back("in_embed"); + if (!model->weights.out_norm_w) missing.push_back("out_norm_w"); + if (!model->weights.out_embed) missing.push_back("out_embed"); + if (n > 0) { + for (size_t i = 0; i < n; ++i) { + size_t hs = model->meta.hs; + size_t nh 
= model->meta.nh; + size_t nkv = model->meta.nkvh; + size_t dh = model->meta.dh; + size_t di = model->meta.di; + + std::vector qw = {nh * dh, hs}; + std::vector kw = {nkv * dh, hs}; + std::vector vw = {nkv * dh, hs}; + std::vector ow = {hs, nh * dh}; + std::vector normv = {hs}; + std::vector gatew = {di, hs}; + std::vector downw = {hs, di}; + + if (!model->weights.attn_q_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_q_w[i], qw, model->meta.dtype)) missing.push_back("attn_q_w[" + std::to_string(i) + "]"); + if (!model->weights.attn_k_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_k_w[i], kw, model->meta.dtype)) missing.push_back("attn_k_w[" + std::to_string(i) + "]"); + if (!model->weights.attn_v_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_v_w[i], vw, model->meta.dtype)) missing.push_back("attn_v_w[" + std::to_string(i) + "]"); + if (!model->weights.attn_o_w[i] || !tensor_matches_shape_and_dtype(model->weights.attn_o_w[i], ow, model->meta.dtype)) missing.push_back("attn_o_w[" + std::to_string(i) + "]"); + if (!model->weights.mlp_norm_w[i] || !tensor_matches_shape_and_dtype(model->weights.mlp_norm_w[i], normv, model->meta.dtype)) missing.push_back("mlp_norm_w[" + std::to_string(i) + "]"); + if (!model->weights.mlp_gate_w[i] || !tensor_matches_shape_and_dtype(model->weights.mlp_gate_w[i], gatew, model->meta.dtype)) missing.push_back("mlp_gate_w[" + std::to_string(i) + "]"); + if (!model->weights.mlp_down_w[i] || !tensor_matches_shape_and_dtype(model->weights.mlp_down_w[i], downw, model->meta.dtype)) missing.push_back("mlp_down_w[" + std::to_string(i) + "]"); + } + } + if (!missing.empty()) { + std::cerr << "[llaisys qwen2] Warning: missing weights:"; + for (auto &s : missing) std::cerr << " " << s; + std::cerr << std::endl; + } + + return 0; +} + +// Simple KV cache structure and APIs +struct KVCache { + size_t max_tokens; + std::vector keys; + std::vector vals; +}; + +__export void *llaisysQwen2KVCreat(struct 
LlaisysQwen2Model * model, size_t max_tokens) { + KVCache *kv = new KVCache(); + kv->max_tokens = max_tokens; + kv->keys.reserve(max_tokens); + kv->vals.reserve(max_tokens); + if (model) model->kv_cache = kv; + return (void *)kv; +} + +__export void llaisysQwen2KVDestroy(void *kv) { + if (!kv) return; + KVCache *c = (KVCache *)kv; + c->keys.clear(); + c->vals.clear(); + delete c; +} + +__export int llaisysQwen2KVAppend(void *kv, llaisysTensor_t k, llaisysTensor_t v) { + if (!kv) return -1; + KVCache *c = (KVCache *)kv; + if (c->keys.size() >= c->max_tokens) return -1; + c->keys.push_back(k); + c->vals.push_back(v); + return 0; +} + +__export size_t llaisysQwen2KVLen(void *kv) { + if (!kv) return 0; + KVCache *c = (KVCache *)kv; + return c->keys.size(); +} + +__export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name) { + if (!model || !name) return 0; + std::string sname(name); + auto it = model->weight_map.find(sname); + return it != model->weight_map.end() ? 
1 : 0; +} + +__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { + // More complete per-layer feedforward inference (attention skipped): + if (!model) return -1; + std::cerr << "[llaisys qwen2] infer entry" << std::endl; + if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; + + int64_t last = token_ids[ntoken - 1]; + + // require embedding and out projection + if (!model->weights.in_embed || !model->weights.out_embed) { + return last; + } + + using namespace llaisys; + + // create index tensor [1] + std::vector idx_shape = {1}; + auto idx_tensor = Tensor::create(idx_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *idx = new LlaisysTensor{idx_tensor}; + idx->tensor->load(&last); + + // embedding -> x [1, hs] + std::vector emb_shape = {1, model->meta.hs}; + auto x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *x = new LlaisysTensor{x_tensor}; + llaisysEmbedding(x, idx, model->weights.in_embed); + std::cerr << "[llaisys qwen2] after embedding" << std::endl; + + // per-layer processing + size_t n = model->meta.nlayer; + for (size_t i = 0; i < n; ++i) { + std::cerr << "[llaisys qwen2] layer " << i << " start" << std::endl; + // optional rms_norm -> normed [1, hs] + LlaisysTensor *norm = nullptr; + if (model->weights.attn_norm_w && model->weights.attn_norm_w[i]) { + auto norm_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + norm = new LlaisysTensor{norm_tensor}; + llaisysRmsNorm(norm, x, model->weights.attn_norm_w[i], model->meta.epsilon); + } + + // Attention: compute q,k,v, append to KV and run self-attention + bool has_attn = model->weights.attn_norm_w && model->weights.attn_q_w && model->weights.attn_k_w && model->weights.attn_v_w && model->weights.attn_o_w && + model->weights.attn_norm_w[i] && model->weights.attn_q_w[i] && model->weights.attn_k_w[i] && model->weights.attn_v_w[i] && 
model->weights.attn_o_w[i]; + + if (has_attn) { + // q: project norm -> [1, nh*dh] then view [1, nh, dh] + size_t nh = model->meta.nh; + size_t dh = model->meta.dh; + size_t nkv = model->meta.nkvh; + + std::vector qflat_shape = {1, nh * dh}; + auto qflat_tensor = Tensor::create(qflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *qflat = new LlaisysTensor{qflat_tensor}; + llaisysLinear(qflat, norm ? norm : x, model->weights.attn_q_w[i], model->weights.attn_q_b && model->weights.attn_q_b[i] ? model->weights.attn_q_b[i] : nullptr); + // view to [1, nh, dh] + LlaisysTensor *q = nullptr; + try { + auto q_view = qflat->tensor->view({1, nh, dh}); + q = new LlaisysTensor{q_view}; + } catch (...) { + delete qflat; qflat = nullptr; + } + + // k: project norm -> [1, nkv*dh] then view [1, nkv, dh] + std::vector kflat_shape = {1, nkv * dh}; + auto kflat_tensor = Tensor::create(kflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *kflat = new LlaisysTensor{kflat_tensor}; + llaisysLinear(kflat, norm ? norm : x, model->weights.attn_k_w[i], model->weights.attn_k_b && model->weights.attn_k_b[i] ? model->weights.attn_k_b[i] : nullptr); + LlaisysTensor *k = nullptr; + try { + auto k_view = kflat->tensor->view({1, nkv, dh}); + k = new LlaisysTensor{k_view}; + } catch (...) { + delete kflat; kflat = nullptr; + } + + // v: project norm -> [1, nkv*dh] then view [1, nkv, dh] + std::vector vflat_shape = {1, nkv * dh}; + auto vflat_tensor = Tensor::create(vflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *vflat = new LlaisysTensor{vflat_tensor}; + llaisysLinear(vflat, norm ? norm : x, model->weights.attn_v_w[i], model->weights.attn_v_b && model->weights.attn_v_b[i] ? model->weights.attn_v_b[i] : nullptr); + LlaisysTensor *v = nullptr; + try { + auto v_view = vflat->tensor->view({1, nkv, dh}); + v = new LlaisysTensor{v_view}; + } catch (...) 
{ + delete vflat; vflat = nullptr; + } + + // append k/v to kv cache if exists (or create ephemeral vectors) + KVCache *kv = model->kv_cache; + if (kv) { + // note: KVAppend stores handles, we transfer ownership semantics to KVCache (do not delete appended tensors here) + llaisysQwen2KVAppend(kv, k, v); + } + + // build k_all and v_all from KV entries + size_t total_len = 1; + std::vector kv_keys; + std::vector kv_vals; + if (kv && !kv->keys.empty()) { + total_len = kv->keys.size(); + kv_keys = kv->keys; + kv_vals = kv->vals; + } else { + total_len = 1; + kv_keys = {k}; + kv_vals = {v}; + } + + // create k_all [total_len, nkv, dh], v_all [total_len, nkv, dh] + std::vector k_all_shape = {total_len, nkv, dh}; + auto k_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + auto v_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *k_all = new LlaisysTensor{k_all_t}; + LlaisysTensor *v_all = new LlaisysTensor{v_all_t}; + + // copy each stored key/val into k_all/v_all + for (size_t j = 0; j < total_len; ++j) { + // compute byte size per entry + size_t elems = nkv * dh; + size_t bytes = elems * model->weights.in_embed->tensor->elementSize(); + // source pointers + const std::byte *src_k = kv_keys[j]->tensor->data(); + const std::byte *src_v = kv_vals[j]->tensor->data(); + std::byte *dst_k = k_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); + std::byte *dst_v = v_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); + // memcpy (works for CPU) + std::memcpy(dst_k, src_k, bytes); + std::memcpy(dst_v, src_v, bytes); + } + + // prepare attn_out [1, nh, dh] + std::vector attn_shape = {1, nh, dh}; + auto attn_t = Tensor::create(attn_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *attn_out = new LlaisysTensor{attn_t}; + + float scale = 1.0f / std::sqrt((float)dh); + llaisysSelfAttention(attn_out, q, k_all, v_all, scale); + + // 
flatten attn_out to [1, hs] and add to x + try { + auto attn_flat = attn_out->tensor->view({1, model->meta.hs}); + LlaisysTensor *attn_flat_t = new LlaisysTensor{attn_flat}; + auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; + llaisysAdd(new_x, x, attn_flat_t); + delete x; x = new_x; + delete attn_flat_t; + } catch (...) { + // ignore + } + + // cleanup temporaries we own (do not delete kv-stored tensors) + delete qflat; if (q) delete q; + if (!kv) { delete kflat; if (k) delete k; delete vflat; if (v) delete v; } + delete k_all; delete v_all; delete attn_out; + } + + // MLP: only run if the expected mlp weights are present + bool has_mlp = model->weights.mlp_norm_w && model->weights.mlp_gate_w && model->weights.mlp_up_w && model->weights.mlp_down_w && + model->weights.mlp_norm_w[i] && model->weights.mlp_gate_w[i] && model->weights.mlp_up_w[i] && model->weights.mlp_down_w[i]; + + if (has_mlp) { + std::cerr << "[llaisys qwen2] layer " << i << " mlp present" << std::endl; + auto mlp_in = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *mlp_in_t = new LlaisysTensor{mlp_in}; + llaisysRmsNorm(mlp_in_t, x, model->weights.mlp_norm_w[i], model->meta.epsilon); + + // gate [1, di] + std::vector gate_shape = {1, model->meta.di}; + auto gate_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *gate = new LlaisysTensor{gate_tensor}; + // up [1, di] + auto up_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *up = new LlaisysTensor{up_tensor}; + + // linear projections + llaisysLinear(gate, mlp_in_t, model->weights.mlp_gate_w[i], nullptr); + llaisysLinear(up, mlp_in_t, model->weights.mlp_up_w[i], nullptr); + + // swiglu -> act [1, di] + auto act_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *act = new 
LlaisysTensor{act_tensor}; + llaisysSwiGLU(act, gate, up); + + // down projection -> out [1, hs] + auto down_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *down = new LlaisysTensor{down_tensor}; + llaisysLinear(down, act, model->weights.mlp_down_w[i], nullptr); + + // residual: x = x + down + auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; + llaisysAdd(new_x, x, down); + + // swap x + delete x; x = new_x; + + // cleanup temporaries + delete mlp_in_t; + delete gate; + delete up; + delete act; + delete down; + std::cerr << "[llaisys qwen2] layer " << i << " mlp done" << std::endl; + } + + if (norm) delete norm; + std::cerr << "[llaisys qwen2] layer " << i << " end" << std::endl; + } + + // logits + std::cerr << "[llaisys qwen2] before logits" << std::endl; + std::cerr << "[llaisys qwen2] x ptr=" << x << " out_embed ptr=" << model->weights.out_embed << std::endl; + std::vector logits_shape = {1, model->meta.voc}; + auto logits_tensor = Tensor::create(logits_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *logits = new LlaisysTensor{logits_tensor}; + std::cerr << "[llaisys qwen2] calling llaisysLinear for logits" << std::endl; + llaisysLinear(logits, x, model->weights.out_embed, nullptr); + std::cerr << "[llaisys qwen2] after llaisysLinear for logits" << std::endl; + + // argmax + std::vector one_shape = {1}; + auto max_idx_t = Tensor::create(one_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); + auto max_val_t = Tensor::create(one_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); + LlaisysTensor *max_idx = new LlaisysTensor{max_idx_t}; + LlaisysTensor *max_val = new LlaisysTensor{max_val_t}; + std::cerr << "[llaisys qwen2] calling argmax" << std::endl; + llaisysArgmax(max_idx, max_val, logits); + std::cerr << "[llaisys qwen2] after argmax" << std::endl; + + int64_t *res_ptr = 
reinterpret_cast(max_idx->tensor->data()); + int64_t next = res_ptr ? res_ptr[0] : model->meta.end_token; + + // cleanup + delete idx; + delete x; + delete logits; + delete max_idx; + delete max_val; + + return next; +} + +} // extern "C" + +// end of file diff --git a/xmake.lua b/xmake.lua index 1f65f7a95..095d96875 100644 --- a/xmake.lua +++ b/xmake.lua @@ -106,6 +106,7 @@ target("llaisys") set_languages("cxx17") set_warnings("all", "error") add_files("src/llaisys/*.cc") + add_files("src/llaisys/qwen2.cc") set_installdir(".") From 4c3f0e63a4c0aa6eb301bf515f34a43efe725da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Wed, 28 Jan 2026 15:00:04 +0800 Subject: [PATCH 5/9] fix(qwen2): include for std::sqrt on Linux --- src/llaisys/qwen2.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index b57e1ab6c..1c8082415 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -1,6 +1,7 @@ #include "llaisys/models/qwen2.h" #include "llaisys/tensor.h" #include "llaisys_tensor.hpp" +#include #include #include #include From 54c3ba7879ce6d00d81ac6147225ff17c54f5ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E6=80=A1=E4=BD=B3?= Date: Sun, 8 Mar 2026 20:27:30 +0800 Subject: [PATCH 6/9] feat: add nvidia op kernels and runtime integration --- python/llaisys/models/qwen2.py | 123 ++++- ...nvidia_resource.cu => nvidia_resource.cpp} | 1 + src/device/nvidia/nvidia_runtime_api.cpp | 123 +++++ src/device/nvidia/nvidia_runtime_api.cu | 75 --- src/llaisys/qwen2.cc | 458 ++++++++++-------- src/ops/add/nvidia/add_nvidia.cu | 42 ++ src/ops/add/nvidia/add_nvidia.hpp | 9 + src/ops/add/op.cpp | 6 +- src/ops/argmax/nvidia/argmax_nvidia.cu | 77 +++ src/ops/argmax/nvidia/argmax_nvidia.hpp | 9 + src/ops/argmax/op.cpp | 19 +- src/ops/embedding/nvidia/embedding_nvidia.cu | 65 +++ src/ops/embedding/nvidia/embedding_nvidia.hpp | 15 + src/ops/embedding/op.cpp | 18 +- src/ops/linear/nvidia/linear_nvidia.cu | 139 
++++++ src/ops/linear/nvidia/linear_nvidia.hpp | 17 + src/ops/linear/op.cpp | 19 +- src/ops/nvidia_cuda.cuh | 77 +++ src/ops/rearrange/nvidia/rearrange_nvidia.cu | 108 +++++ src/ops/rearrange/nvidia/rearrange_nvidia.hpp | 16 + src/ops/rearrange/op.cpp | 18 +- src/ops/rms_norm/nvidia/rms_norm_nvidia.cu | 85 ++++ src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp | 16 + src/ops/rms_norm/op.cpp | 18 +- src/ops/rope/nvidia/rope_nvidia.cu | 92 ++++ src/ops/rope/nvidia/rope_nvidia.hpp | 17 + src/ops/rope/op.cpp | 18 +- .../nvidia/self_attention_nvidia.cu | 149 ++++++ .../nvidia/self_attention_nvidia.hpp | 21 + src/ops/self_attention/op.cpp | 20 +- src/ops/swiglu/nvidia/swiglu_nvidia.cu | 53 ++ src/ops/swiglu/nvidia/swiglu_nvidia.hpp | 14 + src/ops/swiglu/op.cpp | 18 +- test/ops/self_attention.py | 2 +- xmake.lua | 8 +- xmake/nvidia.lua | 36 ++ 36 files changed, 1670 insertions(+), 331 deletions(-) rename src/device/nvidia/{nvidia_resource.cu => nvidia_resource.cpp} (86%) create mode 100644 src/device/nvidia/nvidia_runtime_api.cpp delete mode 100644 src/device/nvidia/nvidia_runtime_api.cu create mode 100644 src/ops/add/nvidia/add_nvidia.cu create mode 100644 src/ops/add/nvidia/add_nvidia.hpp create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.cu create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.hpp create mode 100644 src/ops/embedding/nvidia/embedding_nvidia.cu create mode 100644 src/ops/embedding/nvidia/embedding_nvidia.hpp create mode 100644 src/ops/linear/nvidia/linear_nvidia.cu create mode 100644 src/ops/linear/nvidia/linear_nvidia.hpp create mode 100644 src/ops/nvidia_cuda.cuh create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.cu create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.hpp create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.cu create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp create mode 100644 src/ops/rope/nvidia/rope_nvidia.cu create mode 100644 src/ops/rope/nvidia/rope_nvidia.hpp create mode 100644 
src/ops/self_attention/nvidia/self_attention_nvidia.cu create mode 100644 src/ops/self_attention/nvidia/self_attention_nvidia.hpp create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.cu create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.hpp create mode 100644 xmake/nvidia.lua diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 033fcd9e3..aee1fb4b6 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -5,10 +5,13 @@ from pathlib import Path import safetensors -import numpy as np try: import torch +except Exception: + torch = None + +try: from transformers import AutoModelForCausalLM HF_AVAILABLE = True except Exception: @@ -19,6 +22,13 @@ class Qwen2: def __init__(self, model_path, device: DeviceType = DeviceType.CPU): model_path = Path(model_path) self._backend_model = None + self._backend_kv = None + self._device = DeviceType.CPU if device == DeviceType.CPU else DeviceType.NVIDIA + self._device_id = 0 + self._weight_tensors = [] + self._maxseq = 2048 + self._end_token = -1 + self._dtype = DataType.F32 # attempt to create backend model try: @@ -32,7 +42,14 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): cfg = json.load(f) meta = LlaisysQwen2Meta() - meta.dtype = DataType.BF16 + dtype_str = str(cfg.get("torch_dtype", "bfloat16")).lower() + if "bfloat16" in dtype_str: + self._dtype = DataType.BF16 + elif "float16" in dtype_str or "half" in dtype_str: + self._dtype = DataType.F16 + else: + self._dtype = DataType.F32 + meta.dtype = self._dtype # robustly extract fields from config meta.nlayer = int(cfg.get("num_hidden_layers", cfg.get("n_layer", cfg.get("num_layers", 0)))) meta.hs = int(cfg.get("hidden_size", cfg.get("d_model", 0))) @@ -47,41 +64,62 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): meta.epsilon = float(cfg.get("layer_norm_eps", cfg.get("eps", 1e-5))) meta.theta = float(cfg.get("rope_theta", cfg.get("theta", 10000.0))) meta.end_token = 
int(cfg.get("eos_token_id", cfg.get("end_token", -1))) + self._maxseq = int(meta.maxseq) + self._end_token = int(meta.end_token) - dev = DeviceType.CPU if device == DeviceType.CPU else DeviceType.NVIDIA import ctypes - self._backend_model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), dev, None, 0) + self._backend_model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), self._device, None, 0) + + if torch is None: + raise RuntimeError("PyTorch is required to load safetensors weights") + + if self._dtype == DataType.BF16: + target_torch_dtype = torch.bfloat16 + elif self._dtype == DataType.F16: + target_torch_dtype = torch.float16 + else: + target_torch_dtype = torch.float32 for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") + data_ = safetensors.safe_open(file, framework="pt", device="cpu") for name_ in data_.keys(): - arr = data_.get_tensor(name_) - if not arr.flags["C_CONTIGUOUS"]: - arr = np.ascontiguousarray(arr) - if arr.dtype != np.float32: - arr = arr.astype(np.float32) - - t = Tensor(shape=arr.shape, dtype=DataType.F32, device=DeviceType.CPU) - t.load(arr.ctypes.data) + ten = data_.get_tensor(name_).detach().cpu().contiguous() + if ten.dtype != target_torch_dtype: + ten = ten.to(target_torch_dtype) + + t = Tensor(shape=ten.shape, dtype=self._dtype, device=self._device, device_id=self._device_id) + t.load(ten.data_ptr()) LIB_LLAISYS.llaisysQwen2ModelSetWeight(self._backend_model, name_.encode("utf-8"), t.lib_tensor()) + self._weight_tensors.append(t) LIB_LLAISYS.llaisysQwen2ModelFinalize(self._backend_model) + self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, self._maxseq) # verify some required weights exist (best-effort) - required = [b"model.norm.weight", b"embed_tokens.weight", b"lm_head.weight"] + required_groups = [ + [b"model.norm.weight", b"norm.weight"], + [b"model.embed_tokens.weight", b"embed_tokens.weight"], + [b"lm_head.weight"], + ] 
missing = [] - for r in required: - try: - has = LIB_LLAISYS.llaisysQwen2ModelHasWeight(self._backend_model, r) - except Exception: - has = 0 - if not has: - missing.append(r.decode("utf-8")) + for group in required_groups: + found = False + for r in group: + try: + has = LIB_LLAISYS.llaisysQwen2ModelHasWeight(self._backend_model, r) + except Exception: + has = 0 + if has: + found = True + break + if not found: + missing.append(group[0].decode("utf-8")) if missing: print("[llaisys qwen2] Warning: missing weights:", missing) except Exception as e: # backend unavailable or error during loading; fall back to HF + print(f"[llaisys qwen2] backend load failed: {e}") self._backend_model = None if self._backend_model is None: @@ -103,9 +141,40 @@ def generate( import ctypes from ctypes import c_int64, c_size_t - arr = (c_int64 * len(inputs))(*inputs) - out = LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(inputs))) - return [int(out)] + input_ids = [int(t) for t in inputs] + if not input_ids: + return [] + + if max_new_tokens is None: + max_new_tokens = 128 + max_new_tokens = int(max_new_tokens) + if max_new_tokens <= 0: + return input_ids + + kv_cap = len(input_ids) + max_new_tokens + if self._maxseq > 0: + kv_cap = min(kv_cap, self._maxseq) + kv_cap = max(kv_cap, 1) + + if self._backend_kv is not None: + LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) + self._backend_kv = None + self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) + + arr = (c_int64 * len(input_ids))(*input_ids) + next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(input_ids)))) + + output_ids = list(input_ids) + for _ in range(max_new_tokens): + if next_token is None: + break + output_ids.append(next_token) + if self._end_token >= 0 and next_token == self._end_token: + break + arr = (c_int64 * 1)(next_token) + next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, 
c_size_t(1))) + + return output_ids input_ids = torch.tensor([list(inputs)], dtype=torch.long, device=self.device) with torch.no_grad(): @@ -119,6 +188,12 @@ def generate( return outputs[0].tolist() def __del__(self): + try: + if self._backend_kv is not None: + LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) + self._backend_kv = None + except Exception: + pass try: if self._backend_model is not None: LIB_LLAISYS.llaisysQwen2ModelDestroy(self._backend_model) diff --git a/src/device/nvidia/nvidia_resource.cu b/src/device/nvidia/nvidia_resource.cpp similarity index 86% rename from src/device/nvidia/nvidia_resource.cu rename to src/device/nvidia/nvidia_resource.cpp index 2e63647e5..84c59c6fb 100644 --- a/src/device/nvidia/nvidia_resource.cu +++ b/src/device/nvidia/nvidia_resource.cpp @@ -3,5 +3,6 @@ namespace llaisys::device::nvidia { Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {} +Resource::~Resource() = default; } // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_runtime_api.cpp b/src/device/nvidia/nvidia_runtime_api.cpp new file mode 100644 index 000000000..53da4bf77 --- /dev/null +++ b/src/device/nvidia/nvidia_runtime_api.cpp @@ -0,0 +1,123 @@ +#include "../runtime_api.hpp" + +#include +#include + +#include + +namespace llaisys::device::nvidia { + +namespace runtime_api { +static inline void check_cuda(cudaError_t err, const char *msg) { + if (err != cudaSuccess) { + std::cerr << "[CUDA ERROR] " << msg << ": " << cudaGetErrorString(err) << std::endl; + throw std::runtime_error("CUDA runtime API failed"); + } +} + +static inline cudaMemcpyKind to_cuda_memcpy_kind(llaisysMemcpyKind_t kind) { + switch (kind) { + case LLAISYS_MEMCPY_H2H: + return cudaMemcpyHostToHost; + case LLAISYS_MEMCPY_H2D: + return cudaMemcpyHostToDevice; + case LLAISYS_MEMCPY_D2H: + return cudaMemcpyDeviceToHost; + case LLAISYS_MEMCPY_D2D: + return cudaMemcpyDeviceToDevice; + default: + throw 
std::invalid_argument("Unsupported memcpy kind"); + } +} + +int getDeviceCount() { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err == cudaErrorNoDevice || err == cudaErrorInsufficientDriver) { + return 0; + } + check_cuda(err, "cudaGetDeviceCount"); + return count; +} + +void setDevice(int device_id) { + check_cuda(cudaSetDevice(device_id), "cudaSetDevice"); +} + +void deviceSynchronize() { + check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize"); +} + +llaisysStream_t createStream() { + cudaStream_t stream = nullptr; + check_cuda(cudaStreamCreate(&stream), "cudaStreamCreate"); + return reinterpret_cast(stream); +} + +void destroyStream(llaisysStream_t stream) { + if (stream == nullptr) { + return; + } + check_cuda(cudaStreamDestroy(reinterpret_cast(stream)), "cudaStreamDestroy"); +} + +void streamSynchronize(llaisysStream_t stream) { + check_cuda(cudaStreamSynchronize(reinterpret_cast(stream)), "cudaStreamSynchronize"); +} + +void *mallocDevice(size_t size) { + void *ptr = nullptr; + check_cuda(cudaMalloc(&ptr, size), "cudaMalloc"); + return ptr; +} + +void freeDevice(void *ptr) { + if (ptr == nullptr) { + return; + } + check_cuda(cudaFree(ptr), "cudaFree"); +} + +void *mallocHost(size_t size) { + void *ptr = nullptr; + check_cuda(cudaMallocHost(&ptr, size), "cudaMallocHost"); + return ptr; +} + +void freeHost(void *ptr) { + if (ptr == nullptr) { + return; + } + check_cuda(cudaFreeHost(ptr), "cudaFreeHost"); +} + +void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { + check_cuda(cudaMemcpy(dst, src, size, to_cuda_memcpy_kind(kind)), "cudaMemcpy"); +} + +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + check_cuda( + cudaMemcpyAsync(dst, src, size, to_cuda_memcpy_kind(kind), reinterpret_cast(stream)), + "cudaMemcpyAsync"); +} + +static const LlaisysRuntimeAPI RUNTIME_API = { + &getDeviceCount, + &setDevice, + &deviceSynchronize, + 
&createStream, + &destroyStream, + &streamSynchronize, + &mallocDevice, + &freeDevice, + &mallocHost, + &freeHost, + &memcpySync, + &memcpyAsync}; + +} // namespace runtime_api + +const LlaisysRuntimeAPI *getRuntimeAPI() { + return &runtime_api::RUNTIME_API; +} +} // namespace llaisys::device::nvidia diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu deleted file mode 100644 index cab928261..000000000 --- a/src/device/nvidia/nvidia_runtime_api.cu +++ /dev/null @@ -1,75 +0,0 @@ -#include "../runtime_api.hpp" - -#include -#include - -namespace llaisys::device::nvidia { - -namespace runtime_api { -int getDeviceCount() { - TO_BE_IMPLEMENTED(); -} - -void setDevice(int) { - TO_BE_IMPLEMENTED(); -} - -void deviceSynchronize() { - TO_BE_IMPLEMENTED(); -} - -llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); -} - -void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); -} -void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); -} - -void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); -} - -void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); -} - -void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); -} - -void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); -} - -void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); -} - -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); -} - -static const LlaisysRuntimeAPI RUNTIME_API = { - &getDeviceCount, - &setDevice, - &deviceSynchronize, - &createStream, - &destroyStream, - &streamSynchronize, - &mallocDevice, - &freeDevice, - &mallocHost, - &freeHost, - &memcpySync, - &memcpyAsync}; - -} // namespace runtime_api - -const LlaisysRuntimeAPI *getRuntimeAPI() { - return &runtime_api::RUNTIME_API; -} -} // namespace llaisys::device::nvidia diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index 1c8082415..4166fe18f 100644 --- 
a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -9,9 +9,12 @@ #include #include #include "llaisys/ops.h" +#include "../core/llaisys_core.hpp" struct LlaisysQwen2Model { LlaisysQwen2Meta meta; + llaisysDeviceType_t device_type = LLAISYS_DEVICE_CPU; + int device_id = 0; LlaisysQwen2Weights weights; // store arbitrary named tensors provided from python std::unordered_map weight_map; @@ -26,12 +29,18 @@ __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Met if (!meta) return nullptr; LlaisysQwen2Model *m = new LlaisysQwen2Model(); m->meta = *meta; + m->device_type = device; + m->device_id = (device_ids != nullptr && ndevice > 0) ? device_ids[0] : 0; memset(&m->weights, 0, sizeof(m->weights)); return m; } __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) { if (!model) return; + if (model->kv_cache) { + llaisysQwen2KVDestroy((void *)model->kv_cache); + model->kv_cache = nullptr; + } model->weight_map.clear(); if (model->weights.attn_norm_w) delete[] model->weights.attn_norm_w; if (model->weights.attn_q_w) delete[] model->weights.attn_q_w; @@ -71,8 +80,29 @@ static bool tensor_matches_shape_and_dtype(llaisysTensor_t t, const std::vector< } } +static void ensure_layer_weight_arrays(LlaisysQwen2Model *model) { + if (!model || model->meta.nlayer == 0) { + return; + } + + size_t n = model->meta.nlayer; + if (!model->weights.attn_norm_w) model->weights.attn_norm_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_w) model->weights.attn_q_w = new llaisysTensor_t[n](); + if (!model->weights.attn_q_b) model->weights.attn_q_b = new llaisysTensor_t[n](); + if (!model->weights.attn_k_w) model->weights.attn_k_w = new llaisysTensor_t[n](); + if (!model->weights.attn_k_b) model->weights.attn_k_b = new llaisysTensor_t[n](); + if (!model->weights.attn_v_w) model->weights.attn_v_w = new llaisysTensor_t[n](); + if (!model->weights.attn_v_b) model->weights.attn_v_b = new llaisysTensor_t[n](); + if (!model->weights.attn_o_w) 
model->weights.attn_o_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_norm_w) model->weights.mlp_norm_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_gate_w) model->weights.mlp_gate_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_up_w) model->weights.mlp_up_w = new llaisysTensor_t[n](); + if (!model->weights.mlp_down_w) model->weights.mlp_down_w = new llaisysTensor_t[n](); +} + __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor) { if (!model || !name) return -1; + ensure_layer_weight_arrays(model); std::string sname(name); model->weight_map[sname] = tensor; @@ -89,7 +119,7 @@ __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const else std::cerr << "[llaisys qwen2] Warning: out_embed shape/dtype mismatch for " << sname << std::endl; return 0; } - if (str_contains_any(sname, {"model.norm.weight", "ln_f.weight", "final_layernorm.weight", "out_norm.weight", "norm.weight"})) { + if (sname == "norm.weight" || str_contains_any(sname, {"model.norm.weight", "ln_f.weight", "final_layernorm.weight", "out_norm.weight"})) { // avoid matching layer-norm inside layers by checking common patterns std::vector exp = {model->meta.hs}; if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.out_norm_w = tensor; @@ -116,8 +146,17 @@ __export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const size_t dh = model->meta.dh; size_t di = model->meta.di; + // post-attention norm + if (str_contains_any(sname, {"post_attention_layernorm.weight", "ln_2.weight", "layernorm_after.weight"})) { + std::vector exp = {hs}; + if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_norm_w[idx] = tensor; + else std::cerr << "[llaisys qwen2] Warning: mlp_norm_w["< exp = {hs}; if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.attn_norm_w[idx] = tensor; else std::cerr << "[llaisys qwen2] 
Warning: attn_norm_w["< exp = {hs}; - if (tensor_matches_shape_and_dtype(tensor, exp, model->meta.dtype)) model->weights.mlp_norm_w[idx] = tensor; - else std::cerr << "[llaisys qwen2] Warning: mlp_norm_w["< exp = {di, hs}; @@ -265,41 +296,91 @@ __export int llaisysQwen2ModelFinalize(struct LlaisysQwen2Model * model) { // Simple KV cache structure and APIs struct KVCache { - size_t max_tokens; - std::vector keys; - std::vector vals; + LlaisysQwen2Model *owner = nullptr; + size_t max_tokens = 0; + size_t nlayer = 0; + std::vector> keys; + std::vector> vals; }; +static llaisysTensor_t make_tensor(const std::vector &shape, llaisysDataType_t dtype, llaisysDeviceType_t device, int device_id) { + return new LlaisysTensor{llaisys::Tensor::create(shape, dtype, device, device_id)}; +} + +static void clear_layer_cache(std::vector &cache) { + for (auto *t : cache) { + delete t; + } + cache.clear(); +} + +static void kv_append_for_layer(KVCache *kv, size_t layer_idx, llaisysTensor_t k, llaisysTensor_t v) { + if (!kv || layer_idx >= kv->nlayer || !k || !v) { + return; + } + auto &ks = kv->keys[layer_idx]; + auto &vs = kv->vals[layer_idx]; + + if (kv->max_tokens > 0 && ks.size() >= kv->max_tokens) { + delete ks.front(); + ks.erase(ks.begin()); + delete vs.front(); + vs.erase(vs.begin()); + } + + ks.push_back(k); + vs.push_back(v); +} + __export void *llaisysQwen2KVCreat(struct LlaisysQwen2Model * model, size_t max_tokens) { + if (!model) return nullptr; + + if (model->kv_cache) { + llaisysQwen2KVDestroy((void *)model->kv_cache); + } + KVCache *kv = new KVCache(); + kv->owner = model; kv->max_tokens = max_tokens; - kv->keys.reserve(max_tokens); - kv->vals.reserve(max_tokens); - if (model) model->kv_cache = kv; + kv->nlayer = model->meta.nlayer; + kv->keys.resize(kv->nlayer); + kv->vals.resize(kv->nlayer); + for (size_t i = 0; i < kv->nlayer; ++i) { + kv->keys[i].reserve(max_tokens); + kv->vals[i].reserve(max_tokens); + } + model->kv_cache = kv; return (void *)kv; } __export void 
llaisysQwen2KVDestroy(void *kv) { if (!kv) return; KVCache *c = (KVCache *)kv; + for (size_t i = 0; i < c->nlayer; ++i) { + clear_layer_cache(c->keys[i]); + clear_layer_cache(c->vals[i]); + } c->keys.clear(); c->vals.clear(); + if (c->owner && c->owner->kv_cache == c) { + c->owner->kv_cache = nullptr; + } delete c; } __export int llaisysQwen2KVAppend(void *kv, llaisysTensor_t k, llaisysTensor_t v) { if (!kv) return -1; KVCache *c = (KVCache *)kv; - if (c->keys.size() >= c->max_tokens) return -1; - c->keys.push_back(k); - c->vals.push_back(v); + if (c->nlayer == 0) return -1; + kv_append_for_layer(c, 0, k, v); return 0; } __export size_t llaisysQwen2KVLen(void *kv) { if (!kv) return 0; KVCache *c = (KVCache *)kv; - return c->keys.size(); + if (c->nlayer == 0) return 0; + return c->keys[0].size(); } __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name) { @@ -309,244 +390,188 @@ __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, co return it != model->weight_map.end() ? 
1 : 0; } -__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { - // More complete per-layer feedforward inference (attention skipped): - if (!model) return -1; - std::cerr << "[llaisys qwen2] infer entry" << std::endl; - if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; - - int64_t last = token_ids[ntoken - 1]; +static int64_t infer_one_token(struct LlaisysQwen2Model *model, int64_t token_id) { + using namespace llaisys; - // require embedding and out projection if (!model->weights.in_embed || !model->weights.out_embed) { - return last; + return token_id; } - using namespace llaisys; + llaisys::core::context().setDevice(model->device_type, model->device_id); + const LlaisysRuntimeAPI *runtime_api = llaisys::core::context().runtime().api(); + + const llaisysDataType_t dtype = model->meta.dtype; + const size_t hs = model->meta.hs; + const size_t nh = model->meta.nh; + const size_t nkvh = model->meta.nkvh; + const size_t dh = model->meta.dh; + const size_t di = model->meta.di; + const std::vector one_shape = {1}; + const std::vector hidden_shape = {1, hs}; + const std::vector q_shape = {1, nh, dh}; + const std::vector kv_shape = {1, nkvh, dh}; - // create index tensor [1] - std::vector idx_shape = {1}; - auto idx_tensor = Tensor::create(idx_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *idx = new LlaisysTensor{idx_tensor}; - idx->tensor->load(&last); + auto idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + idx->tensor->load(&token_id); - // embedding -> x [1, hs] - std::vector emb_shape = {1, model->meta.hs}; - auto x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *x = new LlaisysTensor{x_tensor}; + auto x = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); llaisysEmbedding(x, idx, model->weights.in_embed); - std::cerr << "[llaisys qwen2] after embedding" << 
std::endl; - // per-layer processing - size_t n = model->meta.nlayer; - for (size_t i = 0; i < n; ++i) { - std::cerr << "[llaisys qwen2] layer " << i << " start" << std::endl; - // optional rms_norm -> normed [1, hs] - LlaisysTensor *norm = nullptr; - if (model->weights.attn_norm_w && model->weights.attn_norm_w[i]) { - auto norm_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - norm = new LlaisysTensor{norm_tensor}; - llaisysRmsNorm(norm, x, model->weights.attn_norm_w[i], model->meta.epsilon); - } + KVCache *kv = model->kv_cache; - // Attention: compute q,k,v, append to KV and run self-attention - bool has_attn = model->weights.attn_norm_w && model->weights.attn_q_w && model->weights.attn_k_w && model->weights.attn_v_w && model->weights.attn_o_w && - model->weights.attn_norm_w[i] && model->weights.attn_q_w[i] && model->weights.attn_k_w[i] && model->weights.attn_v_w[i] && model->weights.attn_o_w[i]; + for (size_t i = 0; i < model->meta.nlayer; ++i) { + bool has_attn = model->weights.attn_norm_w && model->weights.attn_q_w && model->weights.attn_k_w && model->weights.attn_v_w && + model->weights.attn_o_w && model->weights.attn_norm_w[i] && model->weights.attn_q_w[i] && + model->weights.attn_k_w[i] && model->weights.attn_v_w[i] && model->weights.attn_o_w[i]; if (has_attn) { - // q: project norm -> [1, nh*dh] then view [1, nh, dh] - size_t nh = model->meta.nh; - size_t dh = model->meta.dh; - size_t nkv = model->meta.nkvh; + auto norm1 = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysRmsNorm(norm1, x, model->weights.attn_norm_w[i], model->meta.epsilon); - std::vector qflat_shape = {1, nh * dh}; - auto qflat_tensor = Tensor::create(qflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *qflat = new LlaisysTensor{qflat_tensor}; - llaisysLinear(qflat, norm ? norm : x, model->weights.attn_q_w[i], model->weights.attn_q_b && model->weights.attn_q_b[i] ? 
model->weights.attn_q_b[i] : nullptr); - // view to [1, nh, dh] - LlaisysTensor *q = nullptr; - try { - auto q_view = qflat->tensor->view({1, nh, dh}); - q = new LlaisysTensor{q_view}; - } catch (...) { - delete qflat; qflat = nullptr; - } + auto qflat = make_tensor({1, nh * dh}, dtype, model->device_type, model->device_id); + auto kflat = make_tensor({1, nkvh * dh}, dtype, model->device_type, model->device_id); + auto vflat = make_tensor({1, nkvh * dh}, dtype, model->device_type, model->device_id); - // k: project norm -> [1, nkv*dh] then view [1, nkv, dh] - std::vector kflat_shape = {1, nkv * dh}; - auto kflat_tensor = Tensor::create(kflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *kflat = new LlaisysTensor{kflat_tensor}; - llaisysLinear(kflat, norm ? norm : x, model->weights.attn_k_w[i], model->weights.attn_k_b && model->weights.attn_k_b[i] ? model->weights.attn_k_b[i] : nullptr); - LlaisysTensor *k = nullptr; - try { - auto k_view = kflat->tensor->view({1, nkv, dh}); - k = new LlaisysTensor{k_view}; - } catch (...) { - delete kflat; kflat = nullptr; - } + llaisysLinear(qflat, norm1, model->weights.attn_q_w[i], model->weights.attn_q_b ? model->weights.attn_q_b[i] : nullptr); + llaisysLinear(kflat, norm1, model->weights.attn_k_w[i], model->weights.attn_k_b ? model->weights.attn_k_b[i] : nullptr); + llaisysLinear(vflat, norm1, model->weights.attn_v_w[i], model->weights.attn_v_b ? model->weights.attn_v_b[i] : nullptr); - // v: project norm -> [1, nkv*dh] then view [1, nkv, dh] - std::vector vflat_shape = {1, nkv * dh}; - auto vflat_tensor = Tensor::create(vflat_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *vflat = new LlaisysTensor{vflat_tensor}; - llaisysLinear(vflat, norm ? norm : x, model->weights.attn_v_w[i], model->weights.attn_v_b && model->weights.attn_v_b[i] ? 
model->weights.attn_v_b[i] : nullptr); - LlaisysTensor *v = nullptr; - try { - auto v_view = vflat->tensor->view({1, nkv, dh}); - v = new LlaisysTensor{v_view}; - } catch (...) { - delete vflat; vflat = nullptr; + auto q = new LlaisysTensor{qflat->tensor->view(q_shape)}; + auto k = new LlaisysTensor{kflat->tensor->view(kv_shape)}; + auto v = new LlaisysTensor{vflat->tensor->view(kv_shape)}; + + int64_t pos = 0; + if (kv && i < kv->nlayer) { + pos = static_cast(kv->keys[i].size()); } - // append k/v to kv cache if exists (or create ephemeral vectors) - KVCache *kv = model->kv_cache; - if (kv) { - // note: KVAppend stores handles, we transfer ownership semantics to KVCache (do not delete appended tensors here) - llaisysQwen2KVAppend(kv, k, v); + auto pos_ids = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + pos_ids->tensor->load(&pos); + + auto q_rope = make_tensor(q_shape, dtype, model->device_type, model->device_id); + auto k_rope = make_tensor(kv_shape, dtype, model->device_type, model->device_id); + llaisysROPE(q_rope, q, pos_ids, model->meta.theta); + llaisysROPE(k_rope, k, pos_ids, model->meta.theta); + + bool saved_in_cache = false; + if (kv && i < kv->nlayer) { + kv_append_for_layer(kv, i, k_rope, v); + saved_in_cache = true; } - // build k_all and v_all from KV entries - size_t total_len = 1; - std::vector kv_keys; - std::vector kv_vals; - if (kv && !kv->keys.empty()) { - total_len = kv->keys.size(); - kv_keys = kv->keys; - kv_vals = kv->vals; + std::vector local_keys; + std::vector local_vals; + const std::vector *k_src = nullptr; + const std::vector *v_src = nullptr; + if (saved_in_cache) { + k_src = &kv->keys[i]; + v_src = &kv->vals[i]; } else { - total_len = 1; - kv_keys = {k}; - kv_vals = {v}; + local_keys.push_back(k_rope); + local_vals.push_back(v); + k_src = &local_keys; + v_src = &local_vals; } - // create k_all [total_len, nkv, dh], v_all [total_len, nkv, dh] - std::vector k_all_shape = {total_len, nkv, dh}; - 
auto k_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - auto v_all_t = Tensor::create(k_all_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *k_all = new LlaisysTensor{k_all_t}; - LlaisysTensor *v_all = new LlaisysTensor{v_all_t}; - - // copy each stored key/val into k_all/v_all + const size_t total_len = k_src->size(); + auto k_all = make_tensor({total_len, nkvh, dh}, dtype, model->device_type, model->device_id); + auto v_all = make_tensor({total_len, nkvh, dh}, dtype, model->device_type, model->device_id); + const size_t bytes_per_token = nkvh * dh * k_all->tensor->elementSize(); for (size_t j = 0; j < total_len; ++j) { - // compute byte size per entry - size_t elems = nkv * dh; - size_t bytes = elems * model->weights.in_embed->tensor->elementSize(); - // source pointers - const std::byte *src_k = kv_keys[j]->tensor->data(); - const std::byte *src_v = kv_vals[j]->tensor->data(); - std::byte *dst_k = k_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); - std::byte *dst_v = v_all->tensor->data() + j * elems * model->weights.in_embed->tensor->elementSize(); - // memcpy (works for CPU) - std::memcpy(dst_k, src_k, bytes); - std::memcpy(dst_v, src_v, bytes); + runtime_api->memcpy_sync(k_all->tensor->data() + j * bytes_per_token, (*k_src)[j]->tensor->data(), bytes_per_token, LLAISYS_MEMCPY_D2D); + runtime_api->memcpy_sync(v_all->tensor->data() + j * bytes_per_token, (*v_src)[j]->tensor->data(), bytes_per_token, LLAISYS_MEMCPY_D2D); } - // prepare attn_out [1, nh, dh] - std::vector attn_shape = {1, nh, dh}; - auto attn_t = Tensor::create(attn_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *attn_out = new LlaisysTensor{attn_t}; - - float scale = 1.0f / std::sqrt((float)dh); - llaisysSelfAttention(attn_out, q, k_all, v_all, scale); - - // flatten attn_out to [1, hs] and add to x - try { - auto attn_flat = attn_out->tensor->view({1, model->meta.hs}); - LlaisysTensor 
*attn_flat_t = new LlaisysTensor{attn_flat}; - auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; - llaisysAdd(new_x, x, attn_flat_t); - delete x; x = new_x; - delete attn_flat_t; - } catch (...) { - // ignore + auto attn_val = make_tensor(q_shape, dtype, model->device_type, model->device_id); + const float scale = 1.0f / std::sqrt(static_cast(dh)); + llaisysSelfAttention(attn_val, q_rope, k_all, v_all, scale); + + auto attn_flat = new LlaisysTensor{attn_val->tensor->view(hidden_shape)}; + auto attn_out = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysLinear(attn_out, attn_flat, model->weights.attn_o_w[i], nullptr); + + auto x_next = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysAdd(x_next, x, attn_out); + delete x; + x = x_next; + + delete norm1; + delete qflat; + delete kflat; + delete vflat; + delete q; + delete k; + delete pos_ids; + delete q_rope; + delete k_all; + delete v_all; + delete attn_val; + delete attn_flat; + delete attn_out; + + if (!saved_in_cache) { + delete k_rope; + delete v; } - - // cleanup temporaries we own (do not delete kv-stored tensors) - delete qflat; if (q) delete q; - if (!kv) { delete kflat; if (k) delete k; delete vflat; if (v) delete v; } - delete k_all; delete v_all; delete attn_out; } - // MLP: only run if the expected mlp weights are present bool has_mlp = model->weights.mlp_norm_w && model->weights.mlp_gate_w && model->weights.mlp_up_w && model->weights.mlp_down_w && - model->weights.mlp_norm_w[i] && model->weights.mlp_gate_w[i] && model->weights.mlp_up_w[i] && model->weights.mlp_down_w[i]; + model->weights.mlp_norm_w[i] && model->weights.mlp_gate_w[i] && model->weights.mlp_up_w[i] && + model->weights.mlp_down_w[i]; if (has_mlp) { - std::cerr << "[llaisys qwen2] layer " << i << " mlp present" << std::endl; - auto mlp_in = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, 
LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *mlp_in_t = new LlaisysTensor{mlp_in}; - llaisysRmsNorm(mlp_in_t, x, model->weights.mlp_norm_w[i], model->meta.epsilon); - - // gate [1, di] - std::vector gate_shape = {1, model->meta.di}; - auto gate_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *gate = new LlaisysTensor{gate_tensor}; - // up [1, di] - auto up_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *up = new LlaisysTensor{up_tensor}; - - // linear projections - llaisysLinear(gate, mlp_in_t, model->weights.mlp_gate_w[i], nullptr); - llaisysLinear(up, mlp_in_t, model->weights.mlp_up_w[i], nullptr); - - // swiglu -> act [1, di] - auto act_tensor = Tensor::create(gate_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *act = new LlaisysTensor{act_tensor}; - llaisysSwiGLU(act, gate, up); + auto norm2 = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysRmsNorm(norm2, x, model->weights.mlp_norm_w[i], model->meta.epsilon); - // down projection -> out [1, hs] - auto down_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *down = new LlaisysTensor{down_tensor}; - llaisysLinear(down, act, model->weights.mlp_down_w[i], nullptr); + auto gate = make_tensor({1, di}, dtype, model->device_type, model->device_id); + auto up = make_tensor({1, di}, dtype, model->device_type, model->device_id); + auto act = make_tensor({1, di}, dtype, model->device_type, model->device_id); + auto down = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); - // residual: x = x + down - auto new_x_tensor = Tensor::create(emb_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *new_x = new LlaisysTensor{new_x_tensor}; - llaisysAdd(new_x, x, down); + llaisysLinear(gate, norm2, model->weights.mlp_gate_w[i], nullptr); + llaisysLinear(up, norm2, model->weights.mlp_up_w[i], nullptr); + 
llaisysSwiGLU(act, gate, up); + llaisysLinear(down, act, model->weights.mlp_down_w[i], nullptr); - // swap x - delete x; x = new_x; + auto x_next = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysAdd(x_next, x, down); + delete x; + x = x_next; - // cleanup temporaries - delete mlp_in_t; + delete norm2; delete gate; delete up; delete act; delete down; - std::cerr << "[llaisys qwen2] layer " << i << " mlp done" << std::endl; } + } - if (norm) delete norm; - std::cerr << "[llaisys qwen2] layer " << i << " end" << std::endl; + llaisysTensor_t logits_in = x; + llaisysTensor_t out_norm = nullptr; + if (model->weights.out_norm_w) { + out_norm = make_tensor(hidden_shape, dtype, model->device_type, model->device_id); + llaisysRmsNorm(out_norm, x, model->weights.out_norm_w, model->meta.epsilon); + logits_in = out_norm; } - // logits - std::cerr << "[llaisys qwen2] before logits" << std::endl; - std::cerr << "[llaisys qwen2] x ptr=" << x << " out_embed ptr=" << model->weights.out_embed << std::endl; - std::vector logits_shape = {1, model->meta.voc}; - auto logits_tensor = Tensor::create(logits_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *logits = new LlaisysTensor{logits_tensor}; - std::cerr << "[llaisys qwen2] calling llaisysLinear for logits" << std::endl; - llaisysLinear(logits, x, model->weights.out_embed, nullptr); - std::cerr << "[llaisys qwen2] after llaisysLinear for logits" << std::endl; - - // argmax - std::vector one_shape = {1}; - auto max_idx_t = Tensor::create(one_shape, LLAISYS_DTYPE_I64, LLAISYS_DEVICE_CPU, 0); - auto max_val_t = Tensor::create(one_shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU, 0); - LlaisysTensor *max_idx = new LlaisysTensor{max_idx_t}; - LlaisysTensor *max_val = new LlaisysTensor{max_val_t}; - std::cerr << "[llaisys qwen2] calling argmax" << std::endl; + auto logits = make_tensor({1, model->meta.voc}, dtype, model->device_type, model->device_id); + llaisysLinear(logits, logits_in, 
model->weights.out_embed, nullptr); + + auto max_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + auto max_val = make_tensor(one_shape, dtype, model->device_type, model->device_id); llaisysArgmax(max_idx, max_val, logits); - std::cerr << "[llaisys qwen2] after argmax" << std::endl; - int64_t *res_ptr = reinterpret_cast(max_idx->tensor->data()); - int64_t next = res_ptr ? res_ptr[0] : model->meta.end_token; + int64_t next = model->meta.end_token; + if (max_idx->tensor->deviceType() == LLAISYS_DEVICE_CPU) { + int64_t *res_ptr = reinterpret_cast(max_idx->tensor->data()); + next = res_ptr ? res_ptr[0] : model->meta.end_token; + } else { + runtime_api->memcpy_sync(&next, max_idx->tensor->data(), sizeof(int64_t), LLAISYS_MEMCPY_D2H); + } - // cleanup delete idx; delete x; + if (out_norm) delete out_norm; delete logits; delete max_idx; delete max_val; @@ -554,6 +579,17 @@ __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_ return next; } +__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { + if (!model) return -1; + if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; + + int64_t next = model->meta.end_token; + for (size_t i = 0; i < ntoken; ++i) { + next = infer_one_token(model, token_ids[i]); + } + return next; +} + } // extern "C" // end of file diff --git a/src/ops/add/nvidia/add_nvidia.cu b/src/ops/add/nvidia/add_nvidia.cu new file mode 100644 index 000000000..372106a2f --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cu @@ -0,0 +1,42 @@ +#include "add_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +namespace llaisys::ops::nvidia { + +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + c[idx] = from_float(to_float(a[idx]) + to_float(b[idx])); +} + +template +void launch_add(std::byte 
*c, const std::byte *a, const std::byte *b, size_t numel) { + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(numel, threads); + auto stream = current_stream(); + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + check_cuda(cudaGetLastError(), "add_kernel launch"); +} + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_add(c, a, b, numel); + case LLAISYS_DTYPE_F16: + return launch_add(c, a, b, numel); + case LLAISYS_DTYPE_BF16: + return launch_add(c, a, b, numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/add/nvidia/add_nvidia.hpp b/src/ops/add/nvidia/add_nvidia.hpp new file mode 100644 index 000000000..2e67b497a --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d7..7f7b40131 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,9 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/add_nvidia.hpp" +#endif namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,8 +28,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cu b/src/ops/argmax/nvidia/argmax_nvidia.cu new file mode 100644 index 000000000..bc426917c --- /dev/null 
+++ b/src/ops/argmax/nvidia/argmax_nvidia.cu @@ -0,0 +1,77 @@ +#include "argmax_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void argmax_kernel(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + constexpr int kBlockSize = 256; + __shared__ float s_val[kBlockSize]; + __shared__ int64_t s_idx[kBlockSize]; + + int tid = threadIdx.x; + + float local_val = -CUDART_INF_F; + int64_t local_idx = 0; + bool has = false; + + for (size_t i = static_cast(tid); i < numel; i += blockDim.x) { + float v = to_float(vals[i]); + if (!has || v > local_val || (v == local_val && static_cast(i) < local_idx)) { + local_val = v; + local_idx = static_cast(i); + has = true; + } + } + + s_val[tid] = local_val; + s_idx[tid] = local_idx; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + float rhs_val = s_val[tid + stride]; + int64_t rhs_idx = s_idx[tid + stride]; + if (rhs_val > s_val[tid] || (rhs_val == s_val[tid] && rhs_idx < s_idx[tid])) { + s_val[tid] = rhs_val; + s_idx[tid] = rhs_idx; + } + } + __syncthreads(); + } + + if (tid == 0) { + *max_idx = s_idx[0]; + *max_val = from_float(s_val[0]); + } +} + +template +void launch_argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t numel) { + constexpr int kBlockSize = 256; + auto stream = current_stream(); + argmax_kernel<<<1, kBlockSize, 0, stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + check_cuda(cudaGetLastError(), "argmax_kernel launch"); +} + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_argmax(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_F16: + return launch_argmax(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_BF16: + return launch_argmax(max_idx, max_val, vals, numel); + 
default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/argmax/nvidia/argmax_nvidia.hpp b/src/ops/argmax/nvidia/argmax_nvidia.hpp new file mode 100644 index 000000000..054fa353c --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 8cafa32de..ac962f269 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/argmax_cpu.hpp" // 记得包含刚才写的新头文件 +#ifdef ENABLE_NVIDIA_API +#include "nvidia/argmax_nvidia.hpp" +#endif namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { @@ -15,7 +18,17 @@ void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { vals->dtype(), vals->numel()); } - // 如果有其他设备,可以在这里继续 switch... 
- EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cu b/src/ops/embedding/nvidia/embedding_nvidia.cu new file mode 100644 index 000000000..5ea8eeb55 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.cu @@ -0,0 +1,65 @@ +#include "embedding_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +namespace llaisys::ops::nvidia { + +template +__global__ void embedding_kernel( + T *out, + const int64_t *index, + const T *weight, + size_t num_indices, + size_t embedding_dim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = num_indices * embedding_dim; + if (idx >= total) { + return; + } + + size_t row = idx / embedding_dim; + size_t col = idx % embedding_dim; + int64_t src_row = index[row]; + out[idx] = weight[static_cast(src_row) * embedding_dim + col]; +} + +template +void launch_embedding( + std::byte *out, + const std::byte *index, + const std::byte *weight, + size_t num_indices, + size_t embedding_dim) { + const size_t total = num_indices * embedding_dim; + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + auto stream = current_stream(); + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + num_indices, + embedding_dim); + check_cuda(cudaGetLastError(), "embedding_kernel launch"); +} + +void embedding( + std::byte *out, + const std::byte *index, + const std::byte 
*weight, + llaisysDataType_t type, + size_t num_indices, + size_t embedding_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_embedding(out, index, weight, num_indices, embedding_dim); + case LLAISYS_DTYPE_F16: + return launch_embedding(out, index, weight, num_indices, embedding_dim); + case LLAISYS_DTYPE_BF16: + return launch_embedding(out, index, weight, num_indices, embedding_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/embedding/nvidia/embedding_nvidia.hpp b/src/ops/embedding/nvidia/embedding_nvidia.hpp new file mode 100644 index 000000000..c77e9d152 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void embedding( + std::byte *out, + const std::byte *index, + const std::byte *weight, + llaisysDataType_t type, + size_t num_indices, + size_t embedding_dim); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 1008838f0..42b2b136d 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/embedding_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/embedding_nvidia.hpp" +#endif namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { @@ -25,6 +28,17 @@ void embedding(tensor_t out, tensor_t index, tensor_t weight) { out->dtype(), num_indices, embedding_dim); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), num_indices, embedding_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::embedding(out->data(), index->data(), weight->data(), out->dtype(), num_indices, 
embedding_dim); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/linear/nvidia/linear_nvidia.cu b/src/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..481570e74 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,139 @@ +#include "linear_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +inline void check_cublas(cublasStatus_t status, const char *msg) { + if (status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "[CUBLAS ERROR] " << msg << ": " << static_cast(status) << std::endl; + throw std::runtime_error("cuBLAS API failed"); + } +} + +static thread_local cublasHandle_t TL_HANDLE = nullptr; + +cublasHandle_t get_cublas_handle() { + if (TL_HANDLE == nullptr) { + check_cublas(cublasCreate(&TL_HANDLE), "cublasCreate"); + } + return TL_HANDLE; +} + +template +__global__ void add_bias_kernel(T *out, const T *bias, size_t M, size_t N) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = M * N; + if (idx >= total) { + return; + } + size_t col = idx % N; + out[idx] = from_float(to_float(out[idx]) + to_float(bias[col])); +} + +void launch_bias( + std::byte *out, + const std::byte *bias, + llaisysDataType_t type, + size_t M, + size_t N, + cudaStream_t stream) { + const size_t total = M * N; + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + + switch (type) { + case LLAISYS_DTYPE_F32: + add_bias_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + M, + N); + break; + case LLAISYS_DTYPE_F16: + add_bias_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + M, + N); + break; + case LLAISYS_DTYPE_BF16: + add_bias_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(bias), + M, + N); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + check_cuda(cudaGetLastError(), 
"linear add_bias kernel launch"); +} + +void linear( + std::byte *out, + const std::byte *in, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t M, + size_t N, + size_t K) { + // row-major: out[M, N] = in[M, K] * weight[N, K]^T + // map to column-major GEMM: + // out_col[N, M] = op(weight_col[K, N]) * in_col[K, M] + // where op = transpose, so result is equivalent to row-major formula. + const float alpha = 1.0f; + const float beta = 0.0f; + + cudaDataType_t data_type; + switch (type) { + case LLAISYS_DTYPE_F32: + data_type = CUDA_R_32F; + break; + case LLAISYS_DTYPE_F16: + data_type = CUDA_R_16F; + break; + case LLAISYS_DTYPE_BF16: + data_type = CUDA_R_16BF; + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + auto stream = current_stream(); + cublasHandle_t handle = get_cublas_handle(); + check_cublas(cublasSetStream(handle, stream), "cublasSetStream"); + + check_cublas( + cublasGemmEx( + handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + static_cast(N), + static_cast(M), + static_cast(K), + &alpha, + weight, + data_type, + static_cast(K), + in, + data_type, + static_cast(K), + &beta, + out, + data_type, + static_cast(N), + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP), + "cublasGemmEx"); + + if (bias != nullptr) { + launch_bias(out, bias, type, M, N, stream); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/linear/nvidia/linear_nvidia.hpp b/src/ops/linear/nvidia/linear_nvidia.hpp new file mode 100644 index 000000000..72f5440b2 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void linear( + std::byte *out, + const std::byte *in, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t M, + size_t N, + size_t K); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 5222917c3..487e2deb2 100644 --- a/src/ops/linear/op.cpp +++ 
b/src/ops/linear/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/linear_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/linear_nvidia.hpp" +#endif namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { @@ -27,6 +30,18 @@ void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { out->dtype(), M, N, K); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + const std::byte *bias_ptr = bias ? bias->data() : nullptr; + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear(out->data(), in->data(), weight->data(), bias_ptr, out->dtype(), M, N, K); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::linear(out->data(), in->data(), weight->data(), bias_ptr, out->dtype(), M, N, K); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/nvidia_cuda.cuh b/src/ops/nvidia_cuda.cuh new file mode 100644 index 000000000..4c41bb544 --- /dev/null +++ b/src/ops/nvidia_cuda.cuh @@ -0,0 +1,77 @@ +#pragma once + +#include "../core/llaisys_core.hpp" +#include "../utils.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { + +inline void check_cuda(cudaError_t err, const char *msg) { + if (err != cudaSuccess) { + std::cerr << "[CUDA ERROR] " << msg << ": " << cudaGetErrorString(err) << std::endl; + throw std::runtime_error("CUDA API failed"); + } +} + +inline cudaStream_t current_stream() { + return reinterpret_cast(llaisys::core::context().runtime().stream()); +} + +template +__device__ inline float to_float(T v); + +template <> +__device__ inline float to_float(float v) { + return v; +} + +template <> +__device__ inline float to_float(llaisys::fp16_t v) { + return 
__half2float(__ushort_as_half(v._v)); +} + +template <> +__device__ inline float to_float(llaisys::bf16_t v) { + return __bfloat162float(__ushort_as_bfloat16(v._v)); +} + +template +__device__ inline T from_float(float v); + +template <> +__device__ inline float from_float(float v) { + return v; +} + +template <> +__device__ inline llaisys::fp16_t from_float(float v) { + llaisys::fp16_t out; + out._v = __half_as_ushort(__float2half_rn(v)); + return out; +} + +template <> +__device__ inline llaisys::bf16_t from_float(float v) { + llaisys::bf16_t out; + out._v = __bfloat16_as_ushort(__float2bfloat16(v)); + return out; +} + +inline int num_threads_1d() { + return 256; +} + +inline int num_blocks_1d(size_t n, int threads = 256) { + return static_cast((n + threads - 1) / threads); +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/ops/rearrange/nvidia/rearrange_nvidia.cu new file mode 100644 index 000000000..e332702be --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cu @@ -0,0 +1,108 @@ +#include "rearrange_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void rearrange_kernel( + T *out, + const T *in, + const int64_t *shape, + const int64_t *in_strides, + const int64_t *out_strides, + int rank, + size_t total) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= total) { + return; + } + + size_t t = idx; + int64_t in_off = 0; + int64_t out_off = 0; + + for (int d = rank - 1; d >= 0; --d) { + int64_t cur = static_cast(t % static_cast(shape[d])); + t /= static_cast(shape[d]); + in_off += cur * in_strides[d]; + out_off += cur * out_strides[d]; + } + + out[static_cast(out_off)] = in[static_cast(in_off)]; +} + +template +void launch_rearrange( + std::byte *out, + const std::byte *in, + const std::vector &shape, + const std::vector &in_strides, + const std::vector &out_strides) { + const int rank = 
static_cast(shape.size()); + std::vector h_shape(rank); + std::vector h_in_strides(rank); + std::vector h_out_strides(rank); + + size_t total = 1; + for (int i = 0; i < rank; ++i) { + h_shape[i] = static_cast(shape[i]); + h_in_strides[i] = static_cast(in_strides[i]); + h_out_strides[i] = static_cast(out_strides[i]); + total *= shape[i]; + } + + int64_t *d_shape = nullptr; + int64_t *d_in_strides = nullptr; + int64_t *d_out_strides = nullptr; + + const size_t meta_bytes = static_cast(rank) * sizeof(int64_t); + auto stream = current_stream(); + + check_cuda(cudaMalloc(&d_shape, meta_bytes), "cudaMalloc d_shape"); + check_cuda(cudaMalloc(&d_in_strides, meta_bytes), "cudaMalloc d_in_strides"); + check_cuda(cudaMalloc(&d_out_strides, meta_bytes), "cudaMalloc d_out_strides"); + + check_cuda(cudaMemcpyAsync(d_shape, h_shape.data(), meta_bytes, cudaMemcpyHostToDevice, stream), "copy shape"); + check_cuda(cudaMemcpyAsync(d_in_strides, h_in_strides.data(), meta_bytes, cudaMemcpyHostToDevice, stream), "copy in_strides"); + check_cuda(cudaMemcpyAsync(d_out_strides, h_out_strides.data(), meta_bytes, cudaMemcpyHostToDevice, stream), "copy out_strides"); + + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + rearrange_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + d_shape, + d_in_strides, + d_out_strides, + rank, + total); + check_cuda(cudaGetLastError(), "rearrange_kernel launch"); + + check_cuda(cudaFree(d_shape), "cudaFree d_shape"); + check_cuda(cudaFree(d_in_strides), "cudaFree d_in_strides"); + check_cuda(cudaFree(d_out_strides), "cudaFree d_out_strides"); +} + +void rearrange( + std::byte *out, + const std::byte *in, + llaisysDataType_t type, + const std::vector &shape, + const std::vector &in_strides, + const std::vector &out_strides) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_rearrange(out, in, shape, in_strides, out_strides); + case LLAISYS_DTYPE_F16: + return launch_rearrange(out, in, shape, 
in_strides, out_strides); + case LLAISYS_DTYPE_BF16: + return launch_rearrange(out, in, shape, in_strides, out_strides); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.hpp b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp new file mode 100644 index 000000000..6784a8b96 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange( + std::byte *out, + const std::byte *in, + llaisysDataType_t type, + const std::vector &shape, + const std::vector &in_strides, + const std::vector &out_strides); +} diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 5542d1345..8a9431e30 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rearrange_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rearrange_nvidia.hpp" +#endif namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { @@ -15,6 +18,17 @@ void rearrange(tensor_t out, tensor_t in) { in->shape(), in->strides(), out->strides()); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rearrange(out->data(), in->data(), out->dtype(), in->shape(), in->strides(), out->strides()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rearrange(out->data(), in->data(), out->dtype(), in->shape(), in->strides(), out->strides()); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} } -} \ No newline at end of file diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu new file mode 100644 index 000000000..e5d35c0c5 --- /dev/null +++ 
b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -0,0 +1,85 @@ +#include "rms_norm_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +namespace llaisys::ops::nvidia { + +template +__global__ void rms_norm_kernel(T *out, const T *in, const T *weight, size_t M, size_t d, float eps) { + size_t row = blockIdx.x; + if (row >= M) { + return; + } + + extern __shared__ float shm[]; + float *s_sum = shm; + + const T *row_in = in + row * d; + T *row_out = out + row * d; + + float local_sum = 0.0f; + for (size_t j = threadIdx.x; j < d; j += blockDim.x) { + float x = to_float(row_in[j]); + local_sum += x * x; + } + + s_sum[threadIdx.x] = local_sum; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + s_sum[threadIdx.x] += s_sum[threadIdx.x + stride]; + } + __syncthreads(); + } + + float inv_rms = rsqrtf(s_sum[0] / static_cast(d) + eps); + for (size_t j = threadIdx.x; j < d; j += blockDim.x) { + float x = to_float(row_in[j]); + float w = to_float(weight[j]); + row_out[j] = from_float(x * w * inv_rms); + } +} + +template +void launch_rms_norm( + std::byte *out, + const std::byte *in, + const std::byte *weight, + size_t M, + size_t d, + float eps) { + const int threads = 256; + const size_t shared_bytes = static_cast(threads) * sizeof(float); + auto stream = current_stream(); + rms_norm_kernel<<(M), threads, shared_bytes, stream>>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + M, + d, + eps); + check_cuda(cudaGetLastError(), "rms_norm_kernel launch"); +} + +void rms_norm( + std::byte *out, + const std::byte *in, + const std::byte *weight, + llaisysDataType_t type, + size_t M, + size_t d, + float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_rms_norm(out, in, weight, M, d, eps); + case LLAISYS_DTYPE_F16: + return launch_rms_norm(out, in, weight, M, d, eps); + case LLAISYS_DTYPE_BF16: + return launch_rms_norm(out, in, weight, M, d, eps); + default: + 
EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp new file mode 100644 index 000000000..8892a1dab --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm( + std::byte *out, + const std::byte *in, + const std::byte *weight, + llaisysDataType_t type, + size_t M, + size_t d, + float eps); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 9b0005857..a971b3ee1 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rms_norm_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rms_norm_nvidia.hpp" +#endif namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { @@ -22,6 +25,17 @@ void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { out->dtype(), M, d, eps); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), M, d, eps); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), M, d, eps); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/rope/nvidia/rope_nvidia.cu b/src/ops/rope/nvidia/rope_nvidia.cu new file mode 100644 index 000000000..f6ecbb60d --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cu @@ -0,0 +1,92 @@ +#include "rope_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ 
void rope_kernel( + T *out, + const T *in, + const int64_t *pos_ids, + size_t seqlen, + size_t nhead, + size_t d, + float theta) { + size_t half_d = d / 2; + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = seqlen * nhead * half_d; + if (idx >= total) { + return; + } + + size_t j = idx % half_d; + size_t tmp = idx / half_d; + size_t h = tmp % nhead; + size_t i = tmp / nhead; + + double pos = static_cast(pos_ids[i]); + double exponent = (2.0 * static_cast(j)) / static_cast(d); + double angle = pos / pow(static_cast(theta), exponent); + double c = cos(angle); + double s = sin(angle); + + size_t base = (i * nhead + h) * d; + size_t idx_a = base + j; + size_t idx_b = base + j + half_d; + + double a = static_cast(to_float(in[idx_a])); + double b = static_cast(to_float(in[idx_b])); + + out[idx_a] = from_float(static_cast(a * c - b * s)); + out[idx_b] = from_float(static_cast(b * c + a * s)); +} + +template +void launch_rope( + std::byte *out, + const std::byte *in, + const std::byte *pos_ids, + size_t seqlen, + size_t nhead, + size_t d, + float theta) { + size_t total = seqlen * nhead * (d / 2); + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + auto stream = current_stream(); + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seqlen, + nhead, + d, + theta); + check_cuda(cudaGetLastError(), "rope_kernel launch"); +} + +void rope( + std::byte *out, + const std::byte *in, + const std::byte *pos_ids, + llaisysDataType_t type, + size_t seqlen, + size_t nhead, + size_t d, + float theta) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_rope(out, in, pos_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_F16: + return launch_rope(out, in, pos_ids, seqlen, nhead, d, theta); + case LLAISYS_DTYPE_BF16: + return launch_rope(out, in, pos_ids, seqlen, nhead, d, theta); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace 
llaisys::ops::nvidia diff --git a/src/ops/rope/nvidia/rope_nvidia.hpp b/src/ops/rope/nvidia/rope_nvidia.hpp new file mode 100644 index 000000000..a93fa9159 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rope( + std::byte *out, + const std::byte *in, + const std::byte *pos_ids, + llaisysDataType_t type, + size_t seqlen, + size_t nhead, + size_t d, + float theta); +} diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index b56241b59..d69b21331 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rope_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/rope_nvidia.hpp" +#endif namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { @@ -24,6 +27,17 @@ void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { out->dtype(), seqlen, nhead, d, theta); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), pos_ids->data(), out->dtype(), seqlen, nhead, d, theta); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::rope(out->data(), in->data(), pos_ids->data(), out->dtype(), seqlen, nhead, d, theta); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops \ No newline at end of file +} // namespace llaisys::ops diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu new file mode 100644 index 000000000..22a525f17 --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -0,0 +1,149 @@ +#include "self_attention_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + 
+template +__global__ void self_attention_kernel( + T *attn_val, + const T *q, + const T *k, + const T *v, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = seqlen * nhead * dv; + if (idx >= total) { + return; + } + + size_t cdv = idx % dv; + size_t t = idx / dv; + size_t h = t % nhead; + size_t i = t / nhead; + + size_t n_groups = nhead / nkvhead; + size_t h_kv = h / n_groups; + + ptrdiff_t cutoff = static_cast(total_len) - static_cast(seqlen) + static_cast(i); + + float max_score = -INFINITY; + for (size_t j = 0; j < total_len; ++j) { + if (static_cast(j) > cutoff) { + continue; + } + + float dot = 0.0f; + size_t q_base = (i * nhead + h) * d; + size_t k_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += to_float(q[q_base + c]) * to_float(k[k_base + c]); + } + float score = dot * scale; + max_score = fmaxf(max_score, score); + } + + float sum_exp = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (static_cast(j) > cutoff) { + continue; + } + + float dot = 0.0f; + size_t q_base = (i * nhead + h) * d; + size_t k_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += to_float(q[q_base + c]) * to_float(k[k_base + c]); + } + float score = dot * scale; + sum_exp += expf(score - max_score); + } + + float out_val = 0.0f; + for (size_t j = 0; j < total_len; ++j) { + if (static_cast(j) > cutoff) { + continue; + } + + float dot = 0.0f; + size_t q_base = (i * nhead + h) * d; + size_t k_base = (j * nkvhead + h_kv) * d; + for (size_t c = 0; c < d; ++c) { + dot += to_float(q[q_base + c]) * to_float(k[k_base + c]); + } + float score = dot * scale; + float prob = expf(score - max_score) / sum_exp; + + size_t v_idx = (j * nkvhead + h_kv) * dv + cdv; + out_val += prob * to_float(v[v_idx]); + } + + attn_val[idx] = from_float(out_val); +} + +template +void launch_self_attention( + 
std::byte *attn_val, + const std::byte *q, + const std::byte *k, + const std::byte *v, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale) { + size_t total = seqlen * nhead * dv; + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(total, threads); + auto stream = current_stream(); + self_attention_kernel<<>>( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, + total_len, + nhead, + nkvhead, + d, + dv, + scale); + check_cuda(cudaGetLastError(), "self_attention_kernel launch"); +} + +void self_attention( + std::byte *attn_val, + const std::byte *q, + const std::byte *k, + const std::byte *v, + llaisysDataType_t type, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_self_attention(attn_val, q, k, v, seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_F16: + return launch_self_attention(attn_val, q, k, v, seqlen, total_len, nhead, nkvhead, d, dv, scale); + case LLAISYS_DTYPE_BF16: + return launch_self_attention(attn_val, q, k, v, seqlen, total_len, nhead, nkvhead, d, dv, scale); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.hpp b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp new file mode 100644 index 000000000..ec7139b4b --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention( + std::byte *attn_val, + const std::byte *q, + const std::byte *k, + const std::byte *v, + llaisysDataType_t type, + size_t seqlen, + size_t total_len, + size_t nhead, + size_t nkvhead, + size_t d, + size_t dv, + float scale); +} diff --git 
a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index b139964de..e98802461 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/self_attention_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/self_attention_nvidia.hpp" +#endif namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { @@ -24,6 +27,19 @@ void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, total_len, nhead, nkvhead, d, dv, scale); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} } -} \ No newline at end of file diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/ops/swiglu/nvidia/swiglu_nvidia.cu new file mode 100644 index 000000000..a1f0f924b --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -0,0 +1,53 @@ +#include "swiglu_nvidia.hpp" + +#include "../../nvidia_cuda.cuh" + +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void swiglu_kernel(T *out, const T *gate, const T *up, size_t numel) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + + const float g = to_float(gate[idx]); + const float u = to_float(up[idx]); + const float silu = g / (1.0f + expf(-g)); + out[idx] = from_float(u * silu); +} + +template +void 
launch_swiglu(std::byte *out, const std::byte *gate, const std::byte *up, size_t numel) { + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(numel, threads); + auto stream = current_stream(); + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + numel); + check_cuda(cudaGetLastError(), "swiglu_kernel launch"); +} + +void swiglu( + std::byte *out, + const std::byte *gate, + const std::byte *up, + llaisysDataType_t type, + size_t num_elements) { + switch (type) { + case LLAISYS_DTYPE_F32: + return launch_swiglu(out, gate, up, num_elements); + case LLAISYS_DTYPE_F16: + return launch_swiglu(out, gate, up, num_elements); + case LLAISYS_DTYPE_BF16: + return launch_swiglu(out, gate, up, num_elements); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::nvidia diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.hpp b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp new file mode 100644 index 000000000..468a79d5c --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void swiglu( + std::byte *out, + const std::byte *gate, + const std::byte *up, + llaisysDataType_t type, + size_t num_elements); +} diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 7f224b214..f8b07dfbf 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -2,6 +2,9 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/swiglu_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/swiglu_nvidia.hpp" +#endif namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { @@ -24,6 +27,17 @@ void swiglu(tensor_t out, tensor_t gate, tensor_t up) { out->dtype(), num_elements); } - EXCEPTION_UNSUPPORTED_DEVICE; + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + 
return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), num_elements); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return nvidia::swiglu(out->data(), gate->data(), up->data(), out->dtype(), num_elements); +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} } -} \ No newline at end of file diff --git a/test/ops/self_attention.py b/test/ops/self_attention.py index a042b51be..abf3927a8 100644 --- a/test/ops/self_attention.py +++ b/test/ops/self_attention.py @@ -15,7 +15,7 @@ def torch_self_attention(attn_val, query, key, value, scale): L, S = query.size(-2), key.size(-2) attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device) - temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=S-L) + temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=S-L) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) diff --git a/xmake.lua b/xmake.lua index 095d96875..f63954ef9 100644 --- a/xmake.lua +++ b/xmake.lua @@ -37,6 +37,9 @@ target("llaisys-device") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-device-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -83,6 +86,9 @@ target_end() target("llaisys-ops") set_kind("static") add_deps("llaisys-ops-cpu") + if has_config("nv-gpu") then + add_deps("llaisys-ops-nvidia") + end set_languages("cxx17") set_warnings("all", "error") @@ -120,4 +126,4 @@ target("llaisys") os.cp("lib/*.so", "python/llaisys/libllaisys/") end end) -target_end() \ No newline at end of file +target_end() diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua new file mode 100644 index 000000000..bbd2e13c3 --- /dev/null +++ b/xmake/nvidia.lua @@ -0,0 +1,36 @@ +target("llaisys-device-nvidia") + set_kind("static") + set_languages("cxx17") + set_warnings("all", "error") + add_includedirs("/usr/local/cuda/include") + add_linkdirs("/usr/local/cuda/lib64") + 
add_links("cudart") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + end + + add_files("../src/device/nvidia/*.cpp") + + on_install(function (target) end) +target_end() + +target("llaisys-ops-nvidia") + set_kind("static") + add_deps("llaisys-tensor") + + add_rules("cuda") + set_values("cuda.rdc", false) + + set_languages("cxx17") + set_warnings("all", "error") + add_includedirs("/usr/local/cuda/include") + add_linkdirs("/usr/local/cuda/lib64") + add_links("cudart", "cublas") + if not is_plat("windows") then + add_cuflags("-Xcompiler=-fPIC") + end + + add_files("../src/ops/*/nvidia/*.cu") + + on_install(function (target) end) +target_end() From 9cb8bb3e9c7df205bdcc1a6cddb77c2af1941e8e Mon Sep 17 00:00:00 2001 From: mengyijia49 Date: Mon, 16 Mar 2026 15:48:11 +0800 Subject: [PATCH 7/9] Add random sampling and minimal chat server/CLI with NVIDIA build support --- include/llaisys/models/qwen2.h | 8 + include/llaisys/ops.h | 7 + python/llaisys/chat/__init__.py | 3 + python/llaisys/chat/cli.py | 137 ++++++++++ python/llaisys/chat/engine.py | 112 +++++++++ python/llaisys/chat/server.py | 189 ++++++++++++++ python/llaisys/libllaisys/ops.py | 12 +- python/llaisys/libllaisys/qwen2.py | 16 ++ python/llaisys/models/qwen2.py | 36 ++- python/llaisys/ops.py | 20 +- src/llaisys/ops.cc | 16 ++ src/llaisys/qwen2.cc | 65 ++++- src/ops/nvidia_cuda.cuh | 7 +- .../random_sample/cpu/random_sample_cpu.cpp | 186 ++++++++++++++ .../random_sample/cpu/random_sample_cpu.hpp | 18 ++ .../nvidia/random_sample_nvidia.cu | 236 ++++++++++++++++++ .../nvidia/random_sample_nvidia.hpp | 18 ++ src/ops/random_sample/op.cpp | 83 ++++++ src/ops/random_sample/op.hpp | 16 ++ xmake/nvidia.lua | 36 ++- 20 files changed, 1197 insertions(+), 24 deletions(-) create mode 100644 python/llaisys/chat/__init__.py create mode 100644 python/llaisys/chat/cli.py create mode 100644 python/llaisys/chat/engine.py create mode 100644 python/llaisys/chat/server.py create mode 100644 
src/ops/random_sample/cpu/random_sample_cpu.cpp create mode 100644 src/ops/random_sample/cpu/random_sample_cpu.hpp create mode 100644 src/ops/random_sample/nvidia/random_sample_nvidia.cu create mode 100644 src/ops/random_sample/nvidia/random_sample_nvidia.hpp create mode 100644 src/ops/random_sample/op.cpp create mode 100644 src/ops/random_sample/op.hpp diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 296f0f15f..04f043a2f 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -47,6 +47,14 @@ __C { __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name); __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export int64_t llaisysQwen2ModelInferSample( + struct LlaisysQwen2Model * model, + int64_t * token_ids, + size_t ntoken, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); // KV cache APIs __export void *llaisysQwen2KVCreat(struct LlaisysQwen2Model * model, size_t max_tokens); diff --git a/include/llaisys/ops.h b/include/llaisys/ops.h index ddb3be246..dd46a18b5 100644 --- a/include/llaisys/ops.h +++ b/include/llaisys/ops.h @@ -6,6 +6,13 @@ __C { __export void llaisysAdd(llaisysTensor_t c, llaisysTensor_t a, llaisysTensor_t b); __export void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals); + __export void llaisysRandomSample( + llaisysTensor_t out_idx, + llaisysTensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); __export void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight); __export void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias); __export void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in); diff --git a/python/llaisys/chat/__init__.py b/python/llaisys/chat/__init__.py new file mode 100644 index 
000000000..ee6c0897e --- /dev/null +++ b/python/llaisys/chat/__init__.py @@ -0,0 +1,3 @@ +from .engine import ChatEngine + +__all__ = ["ChatEngine"] diff --git a/python/llaisys/chat/cli.py b/python/llaisys/chat/cli.py new file mode 100644 index 000000000..de7f27a64 --- /dev/null +++ b/python/llaisys/chat/cli.py @@ -0,0 +1,137 @@ +import argparse +import json +import sys +import urllib.error +import urllib.request +from typing import Any, Dict, List, Optional + + +def _post_json(url: str, payload: Dict[str, Any], stream: bool): + body = json.dumps(payload, ensure_ascii=False).encode("utf-8") + headers = { + "Content-Type": "application/json", + "Accept": "text/event-stream" if stream else "application/json", + } + req = urllib.request.Request(url, data=body, headers=headers, method="POST") + return urllib.request.urlopen(req, timeout=600) + + +def _send_chat( + endpoint: str, + model: str, + messages: List[Dict[str, str]], + max_new_tokens: int, + temperature: float, + top_k: int, + top_p: float, + seed: Optional[int], + stream: bool, +) -> str: + payload = { + "model": model, + "messages": messages, + "max_new_tokens": int(max_new_tokens), + "temperature": float(temperature), + "top_k": int(top_k), + "top_p": float(top_p), + "stream": bool(stream), + } + if seed is not None: + payload["seed"] = int(seed) + + if not stream: + with _post_json(endpoint, payload, stream=False) as resp: + data = json.loads(resp.read().decode("utf-8")) + return data["choices"][0]["message"]["content"] + + assistant_text = "" + with _post_json(endpoint, payload, stream=True) as resp: + for raw in resp: + line = raw.decode("utf-8").strip() + if not line or not line.startswith("data:"): + continue + content = line[5:].strip() + if content == "[DONE]": + break + + chunk = json.loads(content) + choices = chunk.get("choices", []) + if not choices: + continue + delta = choices[0].get("delta", {}) + piece = delta.get("content", "") + if piece: + sys.stdout.write(piece) + sys.stdout.flush() + 
assistant_text += piece + sys.stdout.write("\n") + sys.stdout.flush() + return assistant_text + + +def main(): + parser = argparse.ArgumentParser(description="LLAISYS interactive chat CLI") + parser.add_argument("--server", default="http://127.0.0.1:8000") + parser.add_argument("--model", default="qwen2") + parser.add_argument("--max-new-tokens", default=256, type=int) + parser.add_argument("--temperature", default=0.8, type=float) + parser.add_argument("--top-k", default=50, type=int) + parser.add_argument("--top-p", default=0.9, type=float) + parser.add_argument("--seed", default=None, type=int) + parser.add_argument("--system", default=None, type=str) + parser.add_argument("--stream", action="store_true") + args = parser.parse_args() + + endpoint = args.server.rstrip("/") + "/v1/chat/completions" + messages: List[Dict[str, str]] = [] + if args.system: + messages.append({"role": "system", "content": args.system}) + + print("Commands: /exit to quit, /reset to clear history.") + while True: + try: + user_text = input("You: ").strip() + except EOFError: + print() + break + + if not user_text: + continue + if user_text in ("/exit", "/quit"): + break + if user_text == "/reset": + messages = [] + if args.system: + messages.append({"role": "system", "content": args.system}) + print("History cleared.") + continue + + messages.append({"role": "user", "content": user_text}) + + try: + if args.stream: + sys.stdout.write("Assistant: ") + sys.stdout.flush() + assistant_text = _send_chat( + endpoint=endpoint, + model=args.model, + messages=messages, + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + seed=args.seed, + stream=args.stream, + ) + if not args.stream: + print(f"Assistant: {assistant_text}") + messages.append({"role": "assistant", "content": assistant_text}) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + print(f"HTTP {exc.code}: {detail}") + except 
Exception as exc: # pragma: no cover - network/runtime errors + print(f"Request failed: {exc}") + + +if __name__ == "__main__": + main() diff --git a/python/llaisys/chat/engine.py b/python/llaisys/chat/engine.py new file mode 100644 index 000000000..201b952a7 --- /dev/null +++ b/python/llaisys/chat/engine.py @@ -0,0 +1,112 @@ +from typing import Any, Dict, Iterable, List, Optional + +from transformers import AutoTokenizer + +from .. import DeviceType +from ..models import Qwen2 + + +def _normalize_content(content: Any) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + chunks: List[str] = [] + for item in content: + if isinstance(item, dict): + if item.get("type") == "text" and isinstance(item.get("text"), str): + chunks.append(item["text"]) + elif isinstance(item, str): + chunks.append(item) + return "".join(chunks) + return str(content) + + +def _normalize_messages(messages: Iterable[Dict[str, Any]]) -> List[Dict[str, str]]: + conversation: List[Dict[str, str]] = [] + for message in messages: + if not isinstance(message, dict): + continue + role = str(message.get("role", "user")) + content = _normalize_content(message.get("content", "")) + conversation.append({"role": role, "content": content}) + return conversation + + +class ChatEngine: + def __init__( + self, + model_path: str, + device: str = "cpu", + max_new_tokens: int = 256, + top_k: int = 50, + top_p: float = 0.9, + temperature: float = 0.8, + ): + self.model_path = model_path + self.max_new_tokens = int(max_new_tokens) + self.top_k = int(top_k) + self.top_p = float(top_p) + self.temperature = float(temperature) + + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + if device.lower() == "nvidia": + model_device = DeviceType.NVIDIA + else: + model_device = DeviceType.CPU + self.model = Qwen2(model_path, model_device) + + def complete( + self, + messages: Iterable[Dict[str, Any]], + max_new_tokens: Optional[int] = None, + top_k: 
Optional[int] = None, + top_p: Optional[float] = None, + temperature: Optional[float] = None, + seed: Optional[int] = None, + ) -> Dict[str, Any]: + conversation = _normalize_messages(messages) + input_content = self.tokenizer.apply_chat_template( + conversation=conversation, + add_generation_prompt=True, + tokenize=False, + ) + input_ids = self.tokenizer.encode(input_content) + + out_ids = self.model.generate( + input_ids, + max_new_tokens=self.max_new_tokens if max_new_tokens is None else int(max_new_tokens), + top_k=self.top_k if top_k is None else int(top_k), + top_p=self.top_p if top_p is None else float(top_p), + temperature=self.temperature if temperature is None else float(temperature), + seed=seed, + ) + + prompt_tokens = len(input_ids) + if len(out_ids) >= prompt_tokens: + generated_ids = out_ids[prompt_tokens:] + else: + generated_ids = [] + assistant_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + + return { + "prompt_tokens": prompt_tokens, + "completion_tokens": len(generated_ids), + "total_tokens": prompt_tokens + len(generated_ids), + "output_ids": out_ids, + "generated_ids": generated_ids, + "assistant_text": assistant_text, + } + + def iter_text_deltas(self, generated_ids: Iterable[int]): + seen = "" + buffer: List[int] = [] + for token_id in generated_ids: + buffer.append(int(token_id)) + current = self.tokenizer.decode(buffer, skip_special_tokens=True) + if current.startswith(seen): + delta = current[len(seen):] + else: + delta = current + seen = current + if delta: + yield delta diff --git a/python/llaisys/chat/server.py b/python/llaisys/chat/server.py new file mode 100644 index 000000000..ff1da8044 --- /dev/null +++ b/python/llaisys/chat/server.py @@ -0,0 +1,189 @@ +import argparse +import json +import threading +import time +import uuid +from typing import Any, Dict, List, Optional + +try: + from fastapi import FastAPI + from fastapi.responses import JSONResponse, StreamingResponse + from pydantic import BaseModel 
+except Exception as exc: # pragma: no cover - runtime dependency guard + raise RuntimeError( + "FastAPI is required for chat server. Install with: pip install fastapi uvicorn" + ) from exc + +from .engine import ChatEngine + + +def _dump_model(obj: Any) -> Dict[str, Any]: + if hasattr(obj, "model_dump"): + return obj.model_dump() + if hasattr(obj, "dict"): + return obj.dict() + return dict(obj) + + +class ChatMessage(BaseModel): + role: str + content: Any + + +class ChatCompletionRequest(BaseModel): + model: str = "qwen2" + messages: List[ChatMessage] + max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None + temperature: float = 0.8 + top_p: float = 0.9 + top_k: int = 50 + seed: Optional[int] = None + stream: bool = False + + +def create_app( + model_path: str, + device: str = "cpu", + max_new_tokens: int = 256, + top_k: int = 50, + top_p: float = 0.9, + temperature: float = 0.8, +) -> FastAPI: + app = FastAPI(title="LLAISYS Chat Server", version="0.1.0") + engine = ChatEngine( + model_path=model_path, + device=device, + max_new_tokens=max_new_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ) + request_lock = threading.Lock() + + @app.get("/healthz") + def healthz(): + return {"status": "ok"} + + @app.post("/v1/chat/completions") + def chat_completions(req: ChatCompletionRequest): + request_model = req.model + request_messages = [_dump_model(m) for m in req.messages] + request_max_new_tokens = req.max_new_tokens + if request_max_new_tokens is None: + request_max_new_tokens = req.max_tokens + + with request_lock: + result = engine.complete( + messages=request_messages, + max_new_tokens=request_max_new_tokens, + top_k=req.top_k, + top_p=req.top_p, + temperature=req.temperature, + seed=req.seed, + ) + + completion_id = f"chatcmpl-{uuid.uuid4().hex}" + created = int(time.time()) + + if req.stream: + generated_ids = result["generated_ids"] + + def event_stream(): + role_chunk = { + "id": completion_id, + "object": 
"chat.completion.chunk", + "created": created, + "model": request_model, + "choices": [ + { + "index": 0, + "delta": {"role": "assistant"}, + "finish_reason": None, + } + ], + } + yield f"data: {json.dumps(role_chunk, ensure_ascii=False)}\n\n" + + for delta in engine.iter_text_deltas(generated_ids): + chunk = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": request_model, + "choices": [ + { + "index": 0, + "delta": {"content": delta}, + "finish_reason": None, + } + ], + } + yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n" + + stop_chunk = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": request_model, + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + } + yield f"data: {json.dumps(stop_chunk, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(event_stream(), media_type="text/event-stream") + + payload = { + "id": completion_id, + "object": "chat.completion", + "created": created, + "model": request_model, + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": result["assistant_text"]}, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": result["prompt_tokens"], + "completion_tokens": result["completion_tokens"], + "total_tokens": result["total_tokens"], + }, + } + return JSONResponse(payload) + + return app + + +def main(): + parser = argparse.ArgumentParser(description="LLAISYS OpenAI-compatible chat server") + parser.add_argument("--model", required=True, help="Path to model directory") + parser.add_argument("--device", default="cpu", choices=["cpu", "nvidia"]) + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", default=8000, type=int) + parser.add_argument("--max-new-tokens", default=256, type=int) + parser.add_argument("--top-k", default=50, type=int) + parser.add_argument("--top-p", default=0.9, type=float) + 
parser.add_argument("--temperature", default=0.8, type=float) + args = parser.parse_args() + + app = create_app( + model_path=args.model, + device=args.device, + max_new_tokens=args.max_new_tokens, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + ) + + try: + import uvicorn + except Exception as exc: # pragma: no cover - runtime dependency guard + raise RuntimeError("uvicorn is required. Install with: pip install uvicorn") from exc + + uvicorn.run(app, host=args.host, port=args.port) + + +if __name__ == "__main__": + main() diff --git a/python/llaisys/libllaisys/ops.py b/python/llaisys/libllaisys/ops.py index 5be095eff..0c27b4421 100644 --- a/python/llaisys/libllaisys/ops.py +++ b/python/llaisys/libllaisys/ops.py @@ -1,5 +1,5 @@ from .tensor import llaisysTensor_t -from ctypes import c_float +from ctypes import c_float, c_size_t, c_uint64 def load_ops(lib): lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] @@ -8,6 +8,16 @@ def load_ops(lib): lib.llaisysArgmax.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysArgmax.restype = None + lib.llaisysRandomSample.argtypes = [ + llaisysTensor_t, + llaisysTensor_t, + c_float, + c_size_t, + c_float, + c_uint64, + ] + lib.llaisysRandomSample.restype = None + lib.llaisysEmbedding.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysEmbedding.restype = None diff --git a/python/llaisys/libllaisys/qwen2.py b/python/llaisys/libllaisys/qwen2.py index 0bd15295c..cd803ea18 100644 --- a/python/llaisys/libllaisys/qwen2.py +++ b/python/llaisys/libllaisys/qwen2.py @@ -7,6 +7,7 @@ c_int64, c_void_p, c_char_p, + c_uint64, ) from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t @@ -54,6 +55,21 @@ def load_qwen2(lib): else: print('[libllaisys.qwen2] Warning: llaisysQwen2ModelInfer not found in shared lib') + # int64_t llaisysQwen2ModelInferSample(struct LlaisysQwen2Model * model, + # int64_t * token_ids, size_t ntoken, + # float 
temperature, size_t top_k, float top_p, uint64_t seed); + if hasattr(lib, 'llaisysQwen2ModelInferSample'): + lib.llaisysQwen2ModelInferSample.argtypes = [ + c_void_p, + POINTER(c_int64), + c_size_t, + c_float, + c_size_t, + c_float, + c_uint64, + ] + lib.llaisysQwen2ModelInferSample.restype = c_int64 + # int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor); if hasattr(lib, 'llaisysQwen2ModelSetWeight'): lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p] diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index aee1fb4b6..6d840ce1b 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -3,6 +3,7 @@ from .. import Tensor from ..libllaisys.qwen2 import LlaisysQwen2Meta from pathlib import Path +import random import safetensors @@ -136,10 +137,11 @@ def generate( top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, + seed: int = None, ): if self._backend_model is not None: import ctypes - from ctypes import c_int64, c_size_t + from ctypes import c_float, c_int64, c_size_t, c_uint64 input_ids = [int(t) for t in inputs] if not input_ids: @@ -161,8 +163,36 @@ def generate( self._backend_kv = None self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) + sample_api_available = hasattr(LIB_LLAISYS, "llaisysQwen2ModelInferSample") + sample_top_k = max(0, int(top_k)) + sample_top_p = float(top_p) + sample_temperature = float(temperature) + rng = random.Random() if seed is None else random.Random(int(seed)) + + def infer_with_sampling(token_buffer, n_token): + if sample_api_available: + step_seed = rng.getrandbits(64) + return int( + LIB_LLAISYS.llaisysQwen2ModelInferSample( + self._backend_model, + token_buffer, + c_size_t(n_token), + c_float(sample_temperature), + c_size_t(sample_top_k), + c_float(sample_top_p), + c_uint64(step_seed), + ) + ) + return int( + LIB_LLAISYS.llaisysQwen2ModelInfer( + 
self._backend_model, + token_buffer, + c_size_t(n_token), + ) + ) + arr = (c_int64 * len(input_ids))(*input_ids) - next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(len(input_ids)))) + next_token = infer_with_sampling(arr, len(input_ids)) output_ids = list(input_ids) for _ in range(max_new_tokens): @@ -172,7 +202,7 @@ def generate( if self._end_token >= 0 and next_token == self._end_token: break arr = (c_int64 * 1)(next_token) - next_token = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self._backend_model, arr, c_size_t(1))) + next_token = infer_with_sampling(arr, 1) return output_ids diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc8..d44cba18f 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -1,6 +1,6 @@ from .libllaisys import LIB_LLAISYS from .tensor import Tensor -from ctypes import c_float, c_int +from ctypes import c_float, c_size_t, c_uint64 class Ops: @@ -12,6 +12,24 @@ def add(c: Tensor, a: Tensor, b: Tensor): def argmax(max_idx: Tensor, max_val: Tensor, vals: Tensor): LIB_LLAISYS.llaisysArgmax(max_idx.lib_tensor(), max_val.lib_tensor(), vals.lib_tensor()) + @staticmethod + def random_sample( + out_idx: Tensor, + logits: Tensor, + temperature: float = 1.0, + top_k: int = 0, + top_p: float = 1.0, + seed: int = 0, + ): + LIB_LLAISYS.llaisysRandomSample( + out_idx.lib_tensor(), + logits.lib_tensor(), + c_float(temperature), + c_size_t(max(0, int(top_k))), + c_float(top_p), + c_uint64(int(seed) & 0xFFFFFFFFFFFFFFFF), + ) + @staticmethod def embedding(out: Tensor, index: Tensor, weight: Tensor): LIB_LLAISYS.llaisysEmbedding( diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index ca8de527d..025ec2322 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -4,6 +4,7 @@ #include "../ops/add/op.hpp" #include "../ops/argmax/op.hpp" +#include "../ops/random_sample/op.hpp" #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" #include "../ops/rearrange/op.hpp" @@ -19,6 +20,21 
@@ __C { void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals) { llaisys::ops::argmax(max_idx->tensor, max_val->tensor, vals->tensor); } + void llaisysRandomSample( + llaisysTensor_t out_idx, + llaisysTensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + llaisys::ops::random_sample( + out_idx->tensor, + logits->tensor, + temperature, + top_k, + top_p, + seed); + } void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight) { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index 4166fe18f..1427d24f3 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -390,7 +390,27 @@ __export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, co return it != model->weight_map.end() ? 1 : 0; } -static int64_t infer_one_token(struct LlaisysQwen2Model *model, int64_t token_id) { +static int64_t read_scalar_i64(llaisysTensor_t tensor, const LlaisysRuntimeAPI *runtime_api, int64_t fallback) { + if (!tensor) { + return fallback; + } + if (tensor->tensor->deviceType() == LLAISYS_DEVICE_CPU) { + int64_t *ptr = reinterpret_cast(tensor->tensor->data()); + return ptr ? 
ptr[0] : fallback; + } + int64_t v = fallback; + runtime_api->memcpy_sync(&v, tensor->tensor->data(), sizeof(v), LLAISYS_MEMCPY_D2H); + return v; +} + +static int64_t infer_one_token( + struct LlaisysQwen2Model *model, + int64_t token_id, + bool do_sample, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { using namespace llaisys; if (!model->weights.in_embed || !model->weights.out_embed) { @@ -557,24 +577,25 @@ static int64_t infer_one_token(struct LlaisysQwen2Model *model, int64_t token_id auto logits = make_tensor({1, model->meta.voc}, dtype, model->device_type, model->device_id); llaisysLinear(logits, logits_in, model->weights.out_embed, nullptr); - auto max_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); - auto max_val = make_tensor(one_shape, dtype, model->device_type, model->device_id); - llaisysArgmax(max_idx, max_val, logits); - int64_t next = model->meta.end_token; - if (max_idx->tensor->deviceType() == LLAISYS_DEVICE_CPU) { - int64_t *res_ptr = reinterpret_cast(max_idx->tensor->data()); - next = res_ptr ? 
res_ptr[0] : model->meta.end_token; + if (do_sample) { + auto sample_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + llaisysRandomSample(sample_idx, logits, temperature, top_k, top_p, seed); + next = read_scalar_i64(sample_idx, runtime_api, model->meta.end_token); + delete sample_idx; } else { - runtime_api->memcpy_sync(&next, max_idx->tensor->data(), sizeof(int64_t), LLAISYS_MEMCPY_D2H); + auto max_idx = make_tensor(one_shape, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + auto max_val = make_tensor(one_shape, dtype, model->device_type, model->device_id); + llaisysArgmax(max_idx, max_val, logits); + next = read_scalar_i64(max_idx, runtime_api, model->meta.end_token); + delete max_idx; + delete max_val; } delete idx; delete x; if (out_norm) delete out_norm; delete logits; - delete max_idx; - delete max_val; return next; } @@ -585,7 +606,27 @@ __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_ int64_t next = model->meta.end_token; for (size_t i = 0; i < ntoken; ++i) { - next = infer_one_token(model, token_ids[i]); + next = infer_one_token(model, token_ids[i], false, 1.0f, 1, 1.0f, 0); + } + return next; +} + +__export int64_t llaisysQwen2ModelInferSample( + struct LlaisysQwen2Model * model, + int64_t * token_ids, + size_t ntoken, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + if (!model) return -1; + if (ntoken == 0 || token_ids == nullptr) return model->meta.end_token; + + int64_t next = model->meta.end_token; + for (size_t i = 0; i < ntoken; ++i) { + const bool do_sample = (i + 1 == ntoken); + const uint64_t step_seed = seed + static_cast(i); + next = infer_one_token(model, token_ids[i], do_sample, temperature, top_k, top_p, step_seed); } return next; } diff --git a/src/ops/nvidia_cuda.cuh b/src/ops/nvidia_cuda.cuh index 4c41bb544..70749b723 100644 --- a/src/ops/nvidia_cuda.cuh +++ b/src/ops/nvidia_cuda.cuh @@ -41,7 +41,8 @@ __device__ inline float 
to_float(llaisys::fp16_t v) { template <> __device__ inline float to_float(llaisys::bf16_t v) { - return __bfloat162float(__ushort_as_bfloat16(v._v)); + const uint32_t bits = static_cast(v._v) << 16; + return __uint_as_float(bits); } template @@ -62,7 +63,9 @@ __device__ inline llaisys::fp16_t from_float(float v) { template <> __device__ inline llaisys::bf16_t from_float(float v) { llaisys::bf16_t out; - out._v = __bfloat16_as_ushort(__float2bfloat16(v)); + const uint32_t bits = __float_as_uint(v); + const uint32_t rounding_bias = 0x00007FFFu + ((bits >> 16) & 1u); + out._v = static_cast((bits + rounding_bias) >> 16); return out; } diff --git a/src/ops/random_sample/cpu/random_sample_cpu.cpp b/src/ops/random_sample/cpu/random_sample_cpu.cpp new file mode 100644 index 000000000..03d382c69 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.cpp @@ -0,0 +1,186 @@ +#include "random_sample_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::cpu { + +template +void read_logits_to_float( + std::vector &dst, + const std::byte *src, + size_t numel, + float inv_temperature) { + const T *typed_src = reinterpret_cast(src); + dst.resize(numel); + for (size_t i = 0; i < numel; ++i) { + float v = llaisys::utils::cast(typed_src[i]); + if (!std::isfinite(v)) { + v = v > 0.0f ? 
1e30f : -1e30f; + } + dst[i] = v * inv_temperature; + } +} + +static int64_t argmax_index(const std::vector &logits) { + int64_t best_idx = 0; + float best_val = logits[0]; + for (size_t i = 1; i < logits.size(); ++i) { + if (logits[i] > best_val) { + best_val = logits[i]; + best_idx = static_cast(i); + } + } + return best_idx; +} + +static int64_t sample_index_from_logits( + const std::vector &scaled_logits, + size_t top_k, + float top_p, + uint64_t seed) { + const size_t n = scaled_logits.size(); + if (n == 0) { + return 0; + } + + if (top_k == 0 || top_k > n) { + top_k = n; + } + if (!std::isfinite(top_p)) { + top_p = 1.0f; + } + if (top_p < 0.0f) { + top_p = 0.0f; + } + if (top_p > 1.0f) { + top_p = 1.0f; + } + + std::vector candidates(n); + std::iota(candidates.begin(), candidates.end(), size_t(0)); + + auto by_logit_desc = [&scaled_logits](size_t lhs, size_t rhs) { + const float lv = scaled_logits[lhs]; + const float rv = scaled_logits[rhs]; + if (lv == rv) { + return lhs < rhs; + } + return lv > rv; + }; + + if (top_k < n) { + std::nth_element( + candidates.begin(), + candidates.begin() + static_cast(top_k), + candidates.end(), + by_logit_desc); + candidates.resize(top_k); + } + + std::sort(candidates.begin(), candidates.end(), by_logit_desc); + + if (top_p <= 0.0f || candidates.size() == 1) { + return static_cast(candidates[0]); + } + + if (top_p < 1.0f) { + float max_logit = scaled_logits[candidates[0]]; + std::vector probs(candidates.size(), 0.0); + double prob_sum = 0.0; + for (size_t i = 0; i < candidates.size(); ++i) { + const double w = std::exp(static_cast(scaled_logits[candidates[i]] - max_logit)); + probs[i] = w; + prob_sum += w; + } + + if (!(prob_sum > 0.0) || !std::isfinite(prob_sum)) { + return static_cast(candidates[0]); + } + + double cumulative = 0.0; + size_t keep = 0; + for (size_t i = 0; i < candidates.size(); ++i) { + cumulative += probs[i] / prob_sum; + keep = i + 1; + if (cumulative >= static_cast(top_p)) { + break; + } + } + if (keep 
== 0) { + keep = 1; + } + candidates.resize(keep); + } + + float max_logit = -std::numeric_limits::infinity(); + for (size_t idx : candidates) { + if (scaled_logits[idx] > max_logit) { + max_logit = scaled_logits[idx]; + } + } + + std::vector weights(candidates.size(), 0.0); + double weight_sum = 0.0; + for (size_t i = 0; i < candidates.size(); ++i) { + const double w = std::exp(static_cast(scaled_logits[candidates[i]] - max_logit)); + weights[i] = w; + weight_sum += w; + } + + if (!(weight_sum > 0.0) || !std::isfinite(weight_sum)) { + return static_cast(candidates[0]); + } + + std::mt19937_64 rng(seed); + std::discrete_distribution dist(weights.begin(), weights.end()); + return static_cast(candidates[dist(rng)]); +} + +void random_sample( + std::byte *out_idx, + const std::byte *logits, + llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + ASSERT(out_idx != nullptr, "RandomSample: out_idx cannot be null."); + ASSERT(logits != nullptr, "RandomSample: logits cannot be null."); + ASSERT(numel > 0, "RandomSample: numel must be positive."); + + std::vector scaled_logits; + bool deterministic = !std::isfinite(temperature) || temperature <= 0.0f || top_k == 1 || top_p <= 0.0f; + const float inv_temperature = deterministic ? 
1.0f : (1.0f / temperature); + + switch (type) { + case LLAISYS_DTYPE_F32: + read_logits_to_float(scaled_logits, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_F16: + read_logits_to_float(scaled_logits, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_BF16: + read_logits_to_float(scaled_logits, logits, numel, inv_temperature); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + int64_t sampled = 0; + if (deterministic) { + sampled = argmax_index(scaled_logits); + } else { + sampled = sample_index_from_logits(scaled_logits, top_k, top_p, seed); + } + *reinterpret_cast(out_idx) = sampled; +} + +} // namespace llaisys::ops::cpu diff --git a/src/ops/random_sample/cpu/random_sample_cpu.hpp b/src/ops/random_sample/cpu/random_sample_cpu.hpp new file mode 100644 index 000000000..1ac127c93 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::cpu { +void random_sample( + std::byte *out_idx, + const std::byte *logits, + llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); +} diff --git a/src/ops/random_sample/nvidia/random_sample_nvidia.cu b/src/ops/random_sample/nvidia/random_sample_nvidia.cu new file mode 100644 index 000000000..68e5de7b4 --- /dev/null +++ b/src/ops/random_sample/nvidia/random_sample_nvidia.cu @@ -0,0 +1,236 @@ +#include "random_sample_nvidia.hpp" + +#include "../../argmax/nvidia/argmax_nvidia.hpp" +#include "../../nvidia_cuda.cuh" +#include "../cpu/random_sample_cpu.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { + +template +__global__ void cast_scale_kernel(float *out, const T *in, size_t n, float inv_temperature) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= n) { + return; + } + float v = to_float(in[idx]); + if (!isfinite(v)) { + v = v 
> 0.0f ? 1e30f : -1e30f; + } + out[idx] = v * inv_temperature; +} + +template +void launch_cast_scale(float *out, const std::byte *in, size_t n, float inv_temperature) { + const int threads = num_threads_1d(); + const int blocks = num_blocks_1d(n, threads); + auto stream = current_stream(); + cast_scale_kernel<<>>( + out, + reinterpret_cast(in), + n, + inv_temperature); + check_cuda(cudaGetLastError(), "random_sample cast_scale kernel launch"); +} + +__global__ void mask_one_kernel(float *vals, int64_t idx, size_t n) { + if (threadIdx.x == 0 && blockIdx.x == 0 && idx >= 0 && static_cast(idx) < n) { + vals[idx] = -1e30f; + } +} + +static int64_t sample_from_sorted_host( + const std::vector &sorted_logits, + const std::vector &sorted_indices, + float top_p, + uint64_t seed) { + if (sorted_logits.empty() || sorted_indices.empty()) { + return 0; + } + + if (!std::isfinite(top_p)) { + top_p = 1.0f; + } + if (top_p < 0.0f) { + top_p = 0.0f; + } + if (top_p > 1.0f) { + top_p = 1.0f; + } + + if (top_p <= 0.0f || sorted_logits.size() == 1) { + return sorted_indices[0]; + } + + const float max_logit = sorted_logits[0]; + std::vector weights(sorted_logits.size(), 0.0); + double weight_sum = 0.0; + for (size_t i = 0; i < sorted_logits.size(); ++i) { + const double w = std::exp(static_cast(sorted_logits[i] - max_logit)); + weights[i] = w; + weight_sum += w; + } + + if (!(weight_sum > 0.0) || !std::isfinite(weight_sum)) { + return sorted_indices[0]; + } + + size_t keep = sorted_logits.size(); + if (top_p < 1.0f) { + double cumulative = 0.0; + keep = 0; + for (size_t i = 0; i < weights.size(); ++i) { + cumulative += weights[i] / weight_sum; + keep = i + 1; + if (cumulative >= static_cast(top_p)) { + break; + } + } + if (keep == 0) { + keep = 1; + } + } + + std::mt19937_64 rng(seed); + std::discrete_distribution dist(weights.begin(), weights.begin() + static_cast(keep)); + return sorted_indices[dist(rng)]; +} + +void random_sample( + std::byte *out_idx, + const std::byte *logits, 
+ llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + ASSERT(out_idx != nullptr, "RandomSample(nvidia): out_idx cannot be null."); + ASSERT(logits != nullptr, "RandomSample(nvidia): logits cannot be null."); + ASSERT(numel > 0, "RandomSample(nvidia): numel must be positive."); + + const bool deterministic = !std::isfinite(temperature) || temperature <= 0.0f || top_k == 1 || top_p <= 0.0f; + const float inv_temperature = deterministic ? 1.0f : (1.0f / temperature); + + float *d_scaled = nullptr; + int64_t *d_max_idx = nullptr; + float *d_max_val = nullptr; + + check_cuda(cudaMalloc(&d_scaled, numel * sizeof(float)), "random_sample cudaMalloc scaled"); + check_cuda(cudaMalloc(&d_max_idx, sizeof(int64_t)), "random_sample cudaMalloc max_idx"); + check_cuda(cudaMalloc(&d_max_val, sizeof(float)), "random_sample cudaMalloc max_val"); + + switch (type) { + case LLAISYS_DTYPE_F32: + launch_cast_scale(d_scaled, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_F16: + launch_cast_scale(d_scaled, logits, numel, inv_temperature); + break; + case LLAISYS_DTYPE_BF16: + launch_cast_scale(d_scaled, logits, numel, inv_temperature); + break; + default: + cudaFree(d_scaled); + cudaFree(d_max_idx); + cudaFree(d_max_val); + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + + auto stream = current_stream(); + int64_t sampled_idx = 0; + + if (deterministic) { + llaisys::ops::nvidia::argmax( + reinterpret_cast(d_max_idx), + reinterpret_cast(d_max_val), + reinterpret_cast(d_scaled), + LLAISYS_DTYPE_F32, + numel); + check_cuda( + cudaMemcpyAsync(&sampled_idx, d_max_idx, sizeof(sampled_idx), cudaMemcpyDeviceToHost, stream), + "random_sample copy deterministic idx"); + check_cuda(cudaStreamSynchronize(stream), "random_sample sync deterministic idx"); + check_cuda( + cudaMemcpy(out_idx, &sampled_idx, sizeof(sampled_idx), cudaMemcpyHostToDevice), + "random_sample deterministic copy idx"); + cudaFree(d_scaled); + 
cudaFree(d_max_idx); + cudaFree(d_max_val); + return; + } + + size_t k = top_k; + if (k == 0 || k > numel) { + k = numel; + } + + // Avoid O(k * vocab) kernel loops for very large k by falling back to single D2H copy. + constexpr size_t kIterativeTopKLimit = 1024; + if (k > kIterativeTopKLimit) { + std::vector host_scaled(numel); + check_cuda( + cudaMemcpyAsync(host_scaled.data(), d_scaled, numel * sizeof(float), cudaMemcpyDeviceToHost, stream), + "random_sample fallback copy scaled"); + check_cuda(cudaStreamSynchronize(stream), "random_sample fallback sync copy scaled"); + cpu::random_sample( + reinterpret_cast(&sampled_idx), + reinterpret_cast(host_scaled.data()), + LLAISYS_DTYPE_F32, + numel, + 1.0f, + top_k, + top_p, + seed); + } else { + std::vector top_logits; + std::vector top_indices; + top_logits.reserve(k); + top_indices.reserve(k); + + for (size_t i = 0; i < k; ++i) { + float h_max_val = -1e30f; + int64_t h_max_idx = 0; + + llaisys::ops::nvidia::argmax( + reinterpret_cast(d_max_idx), + reinterpret_cast(d_max_val), + reinterpret_cast(d_scaled), + LLAISYS_DTYPE_F32, + numel); + + check_cuda( + cudaMemcpyAsync(&h_max_idx, d_max_idx, sizeof(h_max_idx), cudaMemcpyDeviceToHost, stream), + "random_sample copy topk idx"); + check_cuda( + cudaMemcpyAsync(&h_max_val, d_max_val, sizeof(h_max_val), cudaMemcpyDeviceToHost, stream), + "random_sample copy topk val"); + check_cuda(cudaStreamSynchronize(stream), "random_sample sync topk pair"); + + top_indices.push_back(h_max_idx); + top_logits.push_back(h_max_val); + + mask_one_kernel<<<1, 1, 0, stream>>>(d_scaled, h_max_idx, numel); + check_cuda(cudaGetLastError(), "random_sample mask_one kernel launch"); + } + + sampled_idx = sample_from_sorted_host(top_logits, top_indices, top_p, seed); + } + + check_cuda( + cudaMemcpy(out_idx, &sampled_idx, sizeof(sampled_idx), cudaMemcpyHostToDevice), + "random_sample sampled copy idx"); + + cudaFree(d_scaled); + cudaFree(d_max_idx); + cudaFree(d_max_val); +} + +} // namespace 
llaisys::ops::nvidia diff --git a/src/ops/random_sample/nvidia/random_sample_nvidia.hpp b/src/ops/random_sample/nvidia/random_sample_nvidia.hpp new file mode 100644 index 000000000..8f14e99a2 --- /dev/null +++ b/src/ops/random_sample/nvidia/random_sample_nvidia.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void random_sample( + std::byte *out_idx, + const std::byte *logits, + llaisysDataType_t type, + size_t numel, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); +} diff --git a/src/ops/random_sample/op.cpp b/src/ops/random_sample/op.cpp new file mode 100644 index 000000000..37d3ee839 --- /dev/null +++ b/src/ops/random_sample/op.cpp @@ -0,0 +1,83 @@ +#include "op.hpp" + +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/random_sample_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "nvidia/random_sample_nvidia.hpp" +#endif + +#include + +namespace llaisys::ops { + +void random_sample( + tensor_t out_idx, + tensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed) { + CHECK_SAME_DEVICE(out_idx, logits); + ASSERT(out_idx->dtype() == LLAISYS_DTYPE_I64, "RandomSample: out_idx must be Int64."); + ASSERT(out_idx->numel() == 1, "RandomSample: out_idx must have one element."); + ASSERT(logits->isContiguous(), "RandomSample: logits must be contiguous."); + ASSERT(logits->numel() > 0, "RandomSample: logits must be non-empty."); + + if (logits->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::random_sample( + out_idx->data(), + logits->data(), + logits->dtype(), + logits->numel(), + temperature, + top_k, + top_p, + seed); + } + + llaisys::core::context().setDevice(logits->deviceType(), logits->deviceId()); + +#ifdef ENABLE_NVIDIA_API + if (logits->deviceType() == LLAISYS_DEVICE_NVIDIA) { + return nvidia::random_sample( + out_idx->data(), + logits->data(), + logits->dtype(), + logits->numel(), + temperature, + top_k, + top_p, + 
seed); + } +#endif + + const LlaisysRuntimeAPI *runtime_api = llaisys::core::context().runtime().api(); + + const size_t logits_bytes = logits->numel() * logits->elementSize(); + std::vector host_logits(logits_bytes); + runtime_api->memcpy_sync( + host_logits.data(), + logits->data(), + logits_bytes, + LLAISYS_MEMCPY_D2H); + + int64_t sampled_idx = 0; + cpu::random_sample( + reinterpret_cast(&sampled_idx), + host_logits.data(), + logits->dtype(), + logits->numel(), + temperature, + top_k, + top_p, + seed); + + runtime_api->memcpy_sync( + out_idx->data(), + &sampled_idx, + sizeof(sampled_idx), + LLAISYS_MEMCPY_H2D); +} + +} // namespace llaisys::ops diff --git a/src/ops/random_sample/op.hpp b/src/ops/random_sample/op.hpp new file mode 100644 index 000000000..900713224 --- /dev/null +++ b/src/ops/random_sample/op.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../../tensor/tensor.hpp" + +#include +#include + +namespace llaisys::ops { +void random_sample( + tensor_t out_idx, + tensor_t logits, + float temperature, + size_t top_k, + float top_p, + uint64_t seed); +} diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index bbd2e13c3..b0871adb8 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -2,8 +2,21 @@ target("llaisys-device-nvidia") set_kind("static") set_languages("cxx17") set_warnings("all", "error") - add_includedirs("/usr/local/cuda/include") - add_linkdirs("/usr/local/cuda/lib64") + local cuda_sdk = get_config("cuda") + if cuda_sdk then + local cuda_include = path.join(cuda_sdk, "include") + local cuda_lib64 = path.join(cuda_sdk, "lib64") + local cuda_lib = path.join(cuda_sdk, "lib") + if os.isdir(cuda_include) then + add_includedirs(cuda_include) + end + if os.isdir(cuda_lib64) then + add_linkdirs(cuda_lib64) + end + if os.isdir(cuda_lib) then + add_linkdirs(cuda_lib) + end + end add_links("cudart") if not is_plat("windows") then add_cxflags("-fPIC", "-Wno-unknown-pragmas") @@ -23,11 +36,24 @@ target("llaisys-ops-nvidia") set_languages("cxx17") 
set_warnings("all", "error") - add_includedirs("/usr/local/cuda/include") - add_linkdirs("/usr/local/cuda/lib64") + local cuda_sdk = get_config("cuda") + if cuda_sdk then + local cuda_include = path.join(cuda_sdk, "include") + local cuda_lib64 = path.join(cuda_sdk, "lib64") + local cuda_lib = path.join(cuda_sdk, "lib") + if os.isdir(cuda_include) then + add_includedirs(cuda_include) + end + if os.isdir(cuda_lib64) then + add_linkdirs(cuda_lib64) + end + if os.isdir(cuda_lib) then + add_linkdirs(cuda_lib) + end + end add_links("cudart", "cublas") if not is_plat("windows") then - add_cuflags("-Xcompiler=-fPIC") + add_cuflags("-Xcompiler=-fPIC", {force = true}) end add_files("../src/ops/*/nvidia/*.cu") From a4c40d2571da9eaf80e7fa0930824dcdcc773cc5 Mon Sep 17 00:00:00 2001 From: mengyijia49 Date: Mon, 16 Mar 2026 16:28:30 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=A4=9A=E4=BD=99?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/check_exports.py | 7 --- scripts/smoke_generate.py | 106 -------------------------------------- scripts/smoke_qwen2.py | 54 ------------------- scripts/test_ctypes.py | 15 ------ 4 files changed, 182 deletions(-) delete mode 100644 scripts/check_exports.py delete mode 100644 scripts/smoke_generate.py delete mode 100644 scripts/smoke_qwen2.py delete mode 100644 scripts/test_ctypes.py diff --git a/scripts/check_exports.py b/scripts/check_exports.py deleted file mode 100644 index 9e5b03549..000000000 --- a/scripts/check_exports.py +++ /dev/null @@ -1,7 +0,0 @@ -import ctypes -lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') -print('handle=', lib._handle) -GetProcAddress=ctypes.windll.kernel32.GetProcAddress -print('create=', GetProcAddress(lib._handle, b'llaisysQwen2ModelCreate')) -print('tensor=', GetProcAddress(lib._handle, b'tensorCreate')) -print('done') diff --git a/scripts/smoke_generate.py 
b/scripts/smoke_generate.py deleted file mode 100644 index 6de800534..000000000 --- a/scripts/smoke_generate.py +++ /dev/null @@ -1,106 +0,0 @@ -import ctypes -from ctypes import c_size_t, c_int, c_int64, c_void_p, c_char_p, POINTER -import numpy as np -import sys - -DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' -print('Loading DLL', DLL) -lib = ctypes.CDLL(DLL) - -# prototypes -lib.tensorCreate.argtypes = [POINTER(c_size_t), c_size_t, c_int, c_int, c_int] -lib.tensorCreate.restype = c_void_p -lib.tensorLoad.argtypes = [c_void_p, c_void_p] -lib.tensorLoad.restype = None -lib.tensorDestroy.argtypes = [c_void_p] -lib.tensorDestroy.restype = None - -lib.llaisysQwen2ModelCreate.argtypes = [POINTER(ctypes.c_void_p), c_int, POINTER(c_int), c_int] -# We'll not use this prototype; define a simpler one matching earlier header -# But to be safe use c_void_p for meta pointer by building a struct in Python not trivial; instead use direct Create with bytes - -# Define meta struct layout in Python using ctypes -class Meta(ctypes.Structure): - _fields_ = [ - ('dtype', c_int), - ('nlayer', c_size_t), - ('hs', c_size_t), - ('nh', c_size_t), - ('nkvh', c_size_t), - ('dh', c_size_t), - ('di', c_size_t), - ('maxseq', c_size_t), - ('voc', c_size_t), - ('epsilon', ctypes.c_float), - ('theta', ctypes.c_float), - ('end_token', c_int64), - ] - -lib.llaisysQwen2ModelCreate.argtypes = [POINTER(Meta), c_int, POINTER(c_int), c_int] -lib.llaisysQwen2ModelCreate.restype = c_void_p -lib.llaisysQwen2ModelSetWeight.argtypes = [c_void_p, c_char_p, c_void_p] -lib.llaisysQwen2ModelSetWeight.restype = c_int -lib.llaisysQwen2ModelFinalize.argtypes = [c_void_p] -lib.llaisysQwen2ModelFinalize.restype = c_int -lib.llaisysQwen2ModelInfer.argtypes = [c_void_p, POINTER(c_int64), c_size_t] -lib.llaisysQwen2ModelInfer.restype = c_int64 - -# Create small meta -meta = Meta() -meta.dtype = 13 # F32 -meta.nlayer = 0 # no layers to avoid accessing per-layer weights -meta.hs = 16 -meta.nh = 4 
-meta.nkvh = 4 -meta.dh = 4 -meta.di = 64 -meta.maxseq = 128 -meta.voc = 100 -meta.epsilon = 1e-5 -meta.theta = 1.0 -meta.end_token = -1 - -print('Creating model') -model = lib.llaisysQwen2ModelCreate(ctypes.byref(meta), 0, None, 0) -if not model: - print('Model create failed', file=sys.stderr) - sys.exit(2) -print('Model ptr', model) - -# create in_embed tensor shape [voc, hs] -voc = int(meta.voc) -hs = int(meta.hs) -shape = (c_size_t * 2)(voc, hs) -emb_tensor = lib.tensorCreate(shape, 2, 13, 0, 0) -# fill with random floats -arr = (np.random.rand(voc, hs).astype(np.float32)).ctypes -lib.tensorLoad(emb_tensor, arr.data) -print('in_embed created') - -# create out_embed tensor shape [voc, hs] -shape2 = (c_size_t * 2)(voc, hs) -out_tensor = lib.tensorCreate(shape2, 2, 13, 0, 0) -arr2 = (np.random.rand(hs, voc).astype(np.float32)).ctypes -lib.tensorLoad(out_tensor, arr2.data) -print('out_embed created') - -# set weights -ret = lib.llaisysQwen2ModelSetWeight(model, b'embed_tokens.weight', emb_tensor) -print('set in_embed', ret) -ret = lib.llaisysQwen2ModelSetWeight(model, b'lm_head.weight', out_tensor) -print('set out_embed', ret) - -# finalize -lib.llaisysQwen2ModelFinalize(model) -print('finalized') - -# infer on token ids [1,2,3] -seq = (c_int64 * 3)(1, 2, 3) -nexttok = lib.llaisysQwen2ModelInfer(model, seq, 3) -print('next token ->', nexttok) - -# cleanup -lib.tensorDestroy(emb_tensor) -lib.tensorDestroy(out_tensor) -lib.llaisysQwen2ModelDestroy(ctypes.c_void_p(model)) -print('done') diff --git a/scripts/smoke_qwen2.py b/scripts/smoke_qwen2.py deleted file mode 100644 index aacc7b285..000000000 --- a/scripts/smoke_qwen2.py +++ /dev/null @@ -1,54 +0,0 @@ -import ctypes -from ctypes import Structure, POINTER, c_size_t, c_int, c_int64, c_float, c_void_p -import sys - -DLL = r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll' -print('Loading DLL:', DLL) -lib = ctypes.CDLL(DLL) - -class LlaisysQwen2Meta(Structure): - _fields_ = [ - ('dtype', c_int), - 
('nlayer', c_size_t), - ('hs', c_size_t), - ('nh', c_size_t), - ('nkvh', c_size_t), - ('dh', c_size_t), - ('di', c_size_t), - ('maxseq', c_size_t), - ('voc', c_size_t), - ('epsilon', c_float), - ('theta', c_float), - ('end_token', c_int64), - ] - -# prototypes -lib.llaisysQwen2ModelCreate.argtypes = [POINTER(LlaisysQwen2Meta), c_int, POINTER(c_int), c_int] -lib.llaisysQwen2ModelCreate.restype = c_void_p -lib.llaisysQwen2ModelDestroy.argtypes = [c_void_p] -lib.llaisysQwen2ModelDestroy.restype = None - -m = LlaisysQwen2Meta() -m.dtype = 13 # LLAISYS_DTYPE_F32 -m.nlayer = 1 -m.hs = 16 -m.nh = 4 -m.nkvh = 4 -m.dh = 4 -m.di = 64 -m.maxseq = 128 -m.voc = 1000 -m.epsilon = 1e-5 -m.theta = 1.0 -m.end_token = -1 - -print('Calling ll_create...') -model = lib.llaisysQwen2ModelCreate(ctypes.byref(m), 0, None, 0) -print('ll_create returned:', model) -if not model: - print('create failed', file=sys.stderr) - sys.exit(2) - -print('Calling ll_destroy...') -lib.llaisysQwen2ModelDestroy(model) -print('destroy ok') diff --git a/scripts/test_ctypes.py b/scripts/test_ctypes.py deleted file mode 100644 index b236c22bd..000000000 --- a/scripts/test_ctypes.py +++ /dev/null @@ -1,15 +0,0 @@ -import ctypes -lib=ctypes.CDLL(r'D:\infinitensor\tuili\hw3\llaisys\python\llaisys\llaisys.dll') -print('has create attr:', hasattr(lib, 'llaisysQwen2ModelCreate')) -try: - f = lib.llaisysQwen2ModelCreate - print('got create:', f) -except Exception as e: - print('error getting create:', e) - -print('has tensorCreate:', hasattr(lib, 'tensorCreate')) -try: - f2 = lib.tensorCreate - print('got tensorCreate:', f2) -except Exception as e: - print('error getting tensorCreate:', e) From 8ddebee5af27312e6df0925314c2a974c5829aed Mon Sep 17 00:00:00 2001 From: mengyijia49 Date: Mon, 16 Mar 2026 16:45:01 +0800 Subject: [PATCH 9/9] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=9B=9E=E9=80=80?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
python/llaisys/models/qwen2.py | 148 ++++++++++++++------------------- 1 file changed, 62 insertions(+), 86 deletions(-) diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 6d840ce1b..bd9aa4546 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -12,12 +12,6 @@ except Exception: torch = None -try: - from transformers import AutoModelForCausalLM - HF_AVAILABLE = True -except Exception: - HF_AVAILABLE = False - class Qwen2: def __init__(self, model_path, device: DeviceType = DeviceType.CPU): @@ -119,16 +113,7 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): if missing: print("[llaisys qwen2] Warning: missing weights:", missing) except Exception as e: - # backend unavailable or error during loading; fall back to HF - print(f"[llaisys qwen2] backend load failed: {e}") - self._backend_model = None - - if self._backend_model is None: - if not HF_AVAILABLE: - raise RuntimeError("Neither backend nor HuggingFace available for Qwen2 model") - self.device = torch.device("cpu" if device == DeviceType.CPU else ("cuda" if torch.cuda.is_available() else "cpu")) - self.model = AutoModelForCausalLM.from_pretrained(str(model_path), trust_remote_code=True, torch_dtype=torch.bfloat16) - self.model.to(self.device) + raise RuntimeError(f"[llaisys qwen2] backend load failed: {e}") from e def generate( self, @@ -139,83 +124,74 @@ def generate( temperature: float = 0.8, seed: int = None, ): - if self._backend_model is not None: - import ctypes - from ctypes import c_float, c_int64, c_size_t, c_uint64 - - input_ids = [int(t) for t in inputs] - if not input_ids: - return [] - - if max_new_tokens is None: - max_new_tokens = 128 - max_new_tokens = int(max_new_tokens) - if max_new_tokens <= 0: - return input_ids - - kv_cap = len(input_ids) + max_new_tokens - if self._maxseq > 0: - kv_cap = min(kv_cap, self._maxseq) - kv_cap = max(kv_cap, 1) - - if self._backend_kv is not None: - 
LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) - self._backend_kv = None - self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) - - sample_api_available = hasattr(LIB_LLAISYS, "llaisysQwen2ModelInferSample") - sample_top_k = max(0, int(top_k)) - sample_top_p = float(top_p) - sample_temperature = float(temperature) - rng = random.Random() if seed is None else random.Random(int(seed)) - - def infer_with_sampling(token_buffer, n_token): - if sample_api_available: - step_seed = rng.getrandbits(64) - return int( - LIB_LLAISYS.llaisysQwen2ModelInferSample( - self._backend_model, - token_buffer, - c_size_t(n_token), - c_float(sample_temperature), - c_size_t(sample_top_k), - c_float(sample_top_p), - c_uint64(step_seed), - ) - ) + if self._backend_model is None: + raise RuntimeError("Qwen2 backend is not initialized") + + import ctypes + from ctypes import c_float, c_int64, c_size_t, c_uint64 + + input_ids = [int(t) for t in inputs] + if not input_ids: + return [] + + if max_new_tokens is None: + max_new_tokens = 128 + max_new_tokens = int(max_new_tokens) + if max_new_tokens <= 0: + return input_ids + + kv_cap = len(input_ids) + max_new_tokens + if self._maxseq > 0: + kv_cap = min(kv_cap, self._maxseq) + kv_cap = max(kv_cap, 1) + + if self._backend_kv is not None: + LIB_LLAISYS.llaisysQwen2KVDestroy(self._backend_kv) + self._backend_kv = None + self._backend_kv = LIB_LLAISYS.llaisysQwen2KVCreat(self._backend_model, c_size_t(kv_cap)) + + sample_api_available = hasattr(LIB_LLAISYS, "llaisysQwen2ModelInferSample") + sample_top_k = max(0, int(top_k)) + sample_top_p = float(top_p) + sample_temperature = float(temperature) + rng = random.Random() if seed is None else random.Random(int(seed)) + + def infer_with_sampling(token_buffer, n_token): + if sample_api_available: + step_seed = rng.getrandbits(64) return int( - LIB_LLAISYS.llaisysQwen2ModelInfer( + LIB_LLAISYS.llaisysQwen2ModelInferSample( self._backend_model, token_buffer, 
c_size_t(n_token), + c_float(sample_temperature), + c_size_t(sample_top_k), + c_float(sample_top_p), + c_uint64(step_seed), ) ) - - arr = (c_int64 * len(input_ids))(*input_ids) - next_token = infer_with_sampling(arr, len(input_ids)) - - output_ids = list(input_ids) - for _ in range(max_new_tokens): - if next_token is None: - break - output_ids.append(next_token) - if self._end_token >= 0 and next_token == self._end_token: - break - arr = (c_int64 * 1)(next_token) - next_token = infer_with_sampling(arr, 1) - - return output_ids - - input_ids = torch.tensor([list(inputs)], dtype=torch.long, device=self.device) - with torch.no_grad(): - outputs = self.model.generate( - input_ids, - max_new_tokens=max_new_tokens, - top_k=top_k, - top_p=top_p, - temperature=temperature, + return int( + LIB_LLAISYS.llaisysQwen2ModelInfer( + self._backend_model, + token_buffer, + c_size_t(n_token), + ) ) - return outputs[0].tolist() + + arr = (c_int64 * len(input_ids))(*input_ids) + next_token = infer_with_sampling(arr, len(input_ids)) + + output_ids = list(input_ids) + for _ in range(max_new_tokens): + if next_token is None: + break + output_ids.append(next_token) + if self._end_token >= 0 and next_token == self._end_token: + break + arr = (c_int64 * 1)(next_token) + next_token = infer_with_sampling(arr, 1) + + return output_ids def __del__(self): try: