Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@ __C {
struct LlaisysQwen2Weights {
llaisysTensor_t in_embed;
llaisysTensor_t out_embed;
llaisysTensor_t out_norm_w; // a.k.a. model.norm.weight
llaisysTensor_t *attn_norm_w; // a.k.a. input_layernorm.weight
llaisysTensor_t out_norm_w;
llaisysTensor_t *attn_norm_w;
llaisysTensor_t *attn_q_w;
llaisysTensor_t *attn_q_b;
llaisysTensor_t *attn_k_w;
llaisysTensor_t *attn_k_b;
llaisysTensor_t *attn_v_w;
llaisysTensor_t *attn_v_b;
llaisysTensor_t *attn_o_w;
llaisysTensor_t *mlp_norm_w; // a.k.a. post_attention_layernorm.weight
llaisysTensor_t *mlp_norm_w;
llaisysTensor_t *mlp_gate_w;
llaisysTensor_t *mlp_up_w;
llaisysTensor_t *mlp_down_w;
Expand Down
10 changes: 6 additions & 4 deletions python/llaisys/libllaisys/llaisys_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from enum import IntEnum


# Device Type enum
class DeviceType(IntEnum):
CPU = 0
NVIDIA = 1
Expand All @@ -12,7 +11,6 @@ class DeviceType(IntEnum):
llaisysDeviceType_t = ctypes.c_int


# Data Type enum
class DataType(IntEnum):
INVALID = 0
BYTE = 1
Expand All @@ -39,7 +37,6 @@ class DataType(IntEnum):
llaisysDataType_t = ctypes.c_int


# Memory Copy Kind enum
class MemcpyKind(IntEnum):
H2H = 0
H2D = 1
Expand All @@ -48,8 +45,13 @@ class MemcpyKind(IntEnum):


llaisysMemcpyKind_t = ctypes.c_int
# Opaque tensor handle — matches llaisysTensor_t on the C side.
llaisysTensor_t = ctypes.c_void_p

class LlaisysQwen2Model(ctypes.Structure):
    # Opaque model object; only ever used through a pointer.
    # NOTE(review): models.py declares its own LlaisysQwen2Model /
    # llaisysQwen2ModelHandle pair; the two POINTER types are distinct
    # ctypes classes. Consider keeping a single definition — TODO confirm
    # which importers rely on this copy before removing it.
    pass
llaisysQwen2ModelHandle = ctypes.POINTER(LlaisysQwen2Model)
# Raw pointer alias for the C LlaisysQwen2Weights struct.
llaisysQwen2Weights_p = ctypes.c_void_p

# Stream type (opaque pointer)
llaisysStream_t = ctypes.c_void_p

__all__ = [
Expand Down
42 changes: 42 additions & 0 deletions python/llaisys/libllaisys/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import ctypes
from .llaisys_types import llaisysDataType_t, llaisysTensor_t, llaisysDeviceType_t

class LlaisysQwen2Meta(ctypes.Structure):
    """ctypes mirror of the C ``LlaisysQwen2Meta`` struct.

    Field order and types must match the C declaration exactly; the
    ``size_t`` block covers the model hyper-parameters (layer count,
    hidden size, head counts, head dim, intermediate size, max sequence
    length and vocabulary size — see how qwen2.py fills them from
    config.json).
    """

    _fields_ = (
        [("dtype", llaisysDataType_t)]
        + [
            (field_name, ctypes.c_size_t)
            for field_name in ("nlayer", "hs", "nh", "nkvh", "dh", "di", "maxseq", "voc")
        ]
        + [
            ("epsilon", ctypes.c_float),
            ("theta", ctypes.c_float),
            ("end_token", ctypes.c_int64),
        ]
    )

class LlaisysQwen2Weights(ctypes.Structure):
    """ctypes mirror of the C ``LlaisysQwen2Weights`` struct.

    The first three slots are single tensor handles; every other slot is
    a per-layer array of tensor handles (one entry per decoder layer).
    Field order must match the C declaration exactly.
    """

    _fields_ = (
        [
            (single_name, llaisysTensor_t)
            for single_name in ("in_embed", "out_embed", "out_norm_w")
        ]
        + [
            (layer_name, ctypes.POINTER(llaisysTensor_t))
            for layer_name in (
                "attn_norm_w",
                "attn_q_w",
                "attn_q_b",
                "attn_k_w",
                "attn_k_b",
                "attn_v_w",
                "attn_v_b",
                "attn_o_w",
                "mlp_norm_w",
                "mlp_gate_w",
                "mlp_up_w",
                "mlp_down_w",
            )
        ]
    )

class LlaisysQwen2Model(ctypes.Structure):
    """Opaque C-side model object; never instantiated from Python."""


# Pointer type returned by llaisysQwen2ModelCreate and passed to the
# other llaisysQwen2Model* entry points.
llaisysQwen2ModelHandle = ctypes.POINTER(LlaisysQwen2Model)
195 changes: 177 additions & 18 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,192 @@
from typing import Sequence
from ..libllaisys import LIB_LLAISYS
from ..libllaisys import DeviceType

import ctypes
import json
import os
import mmap
import struct
from typing import List, Dict, Optional, Sequence, Any
import numpy as np
from pathlib import Path
import safetensors

from ..libllaisys import LIB_LLAISYS, llaisysTensor_t, llaisysDataType_t, llaisysDeviceType_t, DataType, DeviceType
from ..libllaisys.models import LlaisysQwen2Meta, LlaisysQwen2Weights, LlaisysQwen2Model, llaisysQwen2ModelHandle
from ..tensor import Tensor

# ctypes signatures for the Qwen2 C API, declared once at import time so
# every call site gets proper argument marshalling and return types.
# (This span previously contained stray lines from an interleaved diff;
# the bindings below are the intended final content.)
LIB_LLAISYS.llaisysQwen2ModelCreate.argtypes = [
    ctypes.POINTER(LlaisysQwen2Meta),
    llaisysDeviceType_t,
    ctypes.POINTER(ctypes.c_int),
    ctypes.c_int,
]
LIB_LLAISYS.llaisysQwen2ModelCreate.restype = llaisysQwen2ModelHandle

LIB_LLAISYS.llaisysQwen2ModelDestroy.argtypes = [llaisysQwen2ModelHandle]
LIB_LLAISYS.llaisysQwen2ModelDestroy.restype = None

LIB_LLAISYS.llaisysQwen2ModelWeights.argtypes = [llaisysQwen2ModelHandle]
LIB_LLAISYS.llaisysQwen2ModelWeights.restype = ctypes.POINTER(LlaisysQwen2Weights)

LIB_LLAISYS.llaisysQwen2ModelInfer.argtypes = [
    llaisysQwen2ModelHandle,
    ctypes.POINTER(ctypes.c_int64),
    ctypes.c_size_t,
]
LIB_LLAISYS.llaisysQwen2ModelInfer.restype = ctypes.c_int64


class Qwen2:
    """Python front-end for the llaisys Qwen2 C model.

    Reads ``config.json`` and ``*.safetensors`` weight files from
    ``model_path``, hands the tensors to the C runtime via the
    LlaisysQwen2Weights struct, and exposes greedy token generation.
    """

    def __init__(self, model_path: str, device: DeviceType = DeviceType.CPU, device_id: int = 0):
        model_path = Path(model_path)
        with open(model_path / "config.json", "r", encoding="utf-8") as f:
            config = json.load(f)

        self.device = device
        self.device_id = device_id

        # Fill the C-side meta struct from the HF config; the .get()
        # defaults are only fallbacks for configs missing a key.
        meta = LlaisysQwen2Meta()
        # Weights are widened to float32 on load (see
        # _load_safetensors_bf16_as_f32), so the runtime dtype is always
        # F32 regardless of config["torch_dtype"].
        meta.dtype = DataType.F32
        meta.nlayer = config.get("num_hidden_layers", 24)
        meta.hs = config.get("hidden_size", 2048)
        meta.nh = config.get("num_attention_heads", 16)
        meta.nkvh = config.get("num_key_value_heads", 16)
        meta.dh = meta.hs // meta.nh  # per-head dimension
        meta.di = config.get("intermediate_size", 11008)
        meta.maxseq = config.get("max_position_embeddings", 8192)
        meta.voc = config.get("vocab_size", 151936)
        meta.epsilon = config.get("rms_norm_eps", 1e-6)
        meta.theta = config.get("rope_theta", 1000000.0)
        # Prefer the configured EOS id over the old hardcoded placeholder;
        # 151643 is Qwen2's <|endoftext|> and keeps the previous behavior
        # as the fallback. eos_token_id may be a single id or a list.
        eos = config.get("eos_token_id", 151643)
        meta.end_token = eos[0] if isinstance(eos, list) else eos
        self.meta = meta

        dev_ids = (ctypes.c_int * 1)(device_id)
        self.handle = LIB_LLAISYS.llaisysQwen2ModelCreate(
            ctypes.byref(self.meta), device, dev_ids, 1
        )
        self.weights_ptr = LIB_LLAISYS.llaisysQwen2ModelWeights(self.handle)
        # Keep the Python Tensor wrappers alive for the model's lifetime:
        # the C struct only stores the raw handles.
        self.tensors_ref = []

        for file in sorted(model_path.glob("*.safetensors")):
            print(f"Loading weights from {file}...")
            for key, arr in self._load_safetensors_bf16_as_f32(file).items():
                # t.load() reads raw memory, so the array must be contiguous.
                if not arr.flags['C_CONTIGUOUS']:
                    arr = np.ascontiguousarray(arr)

                t = Tensor(list(arr.shape), self.meta.dtype, device, device_id)
                t.load(ctypes.c_void_p(arr.ctypes.data))

                self._assign_weight(key, t)

    def _load_safetensors_bf16_as_f32(self, path: Path) -> Dict[str, np.ndarray]:
        """Parse one .safetensors file, returning each tensor as float32.

        BF16 and F16 payloads are widened to float32; F32 payloads are
        copied as-is. Tensors of any other dtype are silently skipped
        (best-effort loader). Returns a dict of name -> ndarray.
        """
        tensors: Dict[str, np.ndarray] = {}
        with open(path, "rb") as f:
            length_bytes = f.read(8)
            if not length_bytes:
                return tensors
            # safetensors layout: u64 LE header size, JSON header, raw data.
            header_size = struct.unpack("<Q", length_bytes)[0]
            header = json.loads(f.read(header_size))

            fileno = f.fileno()
            total_size = os.fstat(fileno).st_size
            # `with` guarantees the mapping is closed even if parsing raises
            # (the original leaked it on any exception).
            with mmap.mmap(fileno, total_size, access=mmap.ACCESS_READ) as mm:
                data_start = 8 + header_size

                for key, info in header.items():
                    if key == "__metadata__":
                        continue

                    dtype_str = info["dtype"]
                    shape = info["shape"]
                    start, end = info["data_offsets"]
                    abs_start = data_start + start

                    if dtype_str in ("BF16", "bfloat16"):
                        # bf16 -> f32: the 16 bf16 bits are the high half
                        # of the equivalent f32 bit pattern.
                        raw = np.frombuffer(
                            mm, dtype=np.uint16, count=(end - start) // 2, offset=abs_start
                        )
                        widened = raw.astype(np.uint32) << 16
                        del raw  # drop the mmap view so close() cannot raise BufferError
                        tensors[key] = widened.view(np.float32).reshape(shape)
                    elif dtype_str in ("F32", "float32"):
                        raw = np.frombuffer(
                            mm, dtype=np.float32, count=(end - start) // 4, offset=abs_start
                        )
                        tensors[key] = np.array(raw).reshape(shape)  # copy out of the mmap
                        del raw
                    elif dtype_str in ("F16", "float16"):
                        raw = np.frombuffer(
                            mm, dtype=np.float16, count=(end - start) // 2, offset=abs_start
                        )
                        tensors[key] = raw.astype(np.float32).reshape(shape)
                        del raw
                    # Any other dtype is intentionally ignored.

        return tensors

    def _assign_weight(self, name: str, t: Tensor):
        """Store tensor handle ``t`` into the matching slot of the C weights struct."""
        w = self.weights_ptr.contents
        # Keep alive: the C side holds only the raw handle.
        self.tensors_ref.append(t)

        if name == "model.embed_tokens.weight":
            w.in_embed = t.lib_tensor()
            return
        if name == "lm_head.weight":
            w.out_embed = t.lib_tensor()
            return
        if name == "model.norm.weight":
            w.out_norm_w = t.lib_tensor()
            return
        if not name.startswith("model.layers."):
            return  # unknown weight name: ignore (best-effort, as before)

        # Layer weights: "model.layers.<idx>.<suffix>"
        parts = name.split(".")
        layer_idx = int(parts[2])
        suffix = ".".join(parts[3:])

        per_layer = {
            "input_layernorm.weight": w.attn_norm_w,
            "self_attn.q_proj.weight": w.attn_q_w,
            "self_attn.q_proj.bias": w.attn_q_b,
            "self_attn.k_proj.weight": w.attn_k_w,
            "self_attn.k_proj.bias": w.attn_k_b,
            "self_attn.v_proj.weight": w.attn_v_w,
            "self_attn.v_proj.bias": w.attn_v_b,
            "self_attn.o_proj.weight": w.attn_o_w,
            "post_attention_layernorm.weight": w.mlp_norm_w,
            "mlp.gate_proj.weight": w.mlp_gate_w,
            "mlp.up_proj.weight": w.mlp_up_w,
            "mlp.down_proj.weight": w.mlp_down_w,
        }
        target = per_layer.get(suffix)
        if target is not None:
            target[layer_idx] = t.lib_tensor()

    def __del__(self):
        # May run during interpreter shutdown; guard everything and
        # null the handle so a double __del__ cannot double-free.
        handle = getattr(self, "handle", None)
        if handle:
            LIB_LLAISYS.llaisysQwen2ModelDestroy(handle)
            self.handle = None

    def generate(
        self,
        inputs: Sequence[int],
        max_new_tokens: int = 20,
        top_k: int = 1,
        top_p: float = 0.8,
        temperature: float = 0.8,
    ) -> List[int]:
        """Decode up to ``max_new_tokens`` tokens after the prompt ``inputs``.

        Returns the prompt followed by the generated tokens. ``top_k``,
        ``top_p`` and ``temperature`` are accepted for API compatibility
        but are not forwarded — llaisysQwen2ModelInfer takes only the
        token buffer.
        """
        generated: List[int] = []
        # First call feeds the whole prompt; subsequent calls feed only
        # the newest token (the backend is stateful across calls —
        # presumably KV-cached; confirm against the C implementation).
        feed = list(inputs)
        for _ in range(max_new_tokens):
            arr = (ctypes.c_int64 * len(feed))(*feed)
            next_token = LIB_LLAISYS.llaisysQwen2ModelInfer(self.handle, arr, len(feed))
            generated.append(next_token)
            # Fix: the original only checked end_token from the second
            # generated token onward, so an immediate EOS kept decoding.
            if next_token == self.meta.end_token:
                break
            feed = [next_token]

        return list(inputs) + generated
31 changes: 31 additions & 0 deletions src/llaisys/models/qwen2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "llaisys/models/qwen2.h"
#include "../../models/qwen2/model.hpp"

using namespace llaisys::models::qwen2;

extern "C" {

// Create a Qwen2 model instance and return it as an opaque handle.
// Only the first entry of device_ids is used (single-device for now).
struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) {
    // Guard against a null meta: dereferencing it below would crash.
    // (The sibling wrapper in src/llaisys/qwen2.cc already checks this.)
    if (!meta) {
        return nullptr;
    }
    int dev_id = (ndevice > 0 && device_ids != nullptr) ? device_ids[0] : 0;
    Qwen2Model* model = new Qwen2Model(*meta, device, dev_id);
    return reinterpret_cast<LlaisysQwen2Model*>(model);
}

// Destroy a model previously returned by llaisysQwen2ModelCreate.
// Passing null is a harmless no-op.
void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) {
    if (model == nullptr) {
        return;
    }
    delete reinterpret_cast<Qwen2Model *>(model);
}

struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model) {
if (!model) return nullptr;
return reinterpret_cast<Qwen2Model*>(model)->getWeightsStruct();
}

// Run one inference step over ntoken tokens; returns the next token id,
// or -1 on invalid arguments.
int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) {
    if (!model) return -1;
    // Constructing a vector from a null, non-empty range is undefined
    // behavior — reject it instead of crashing.
    if (!token_ids && ntoken > 0) return -1;
    std::vector<int64_t> tokens(token_ids, token_ids + ntoken);
    return reinterpret_cast<Qwen2Model*>(model)->infer(tokens);
}

}
36 changes: 36 additions & 0 deletions src/llaisys/qwen2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "llaisys/models/qwen2.h"
#include "../models/qwen2/qwen2.hpp"

extern "C" {

// Create a llaisys::Qwen2Model and return it as an opaque C handle.
// Returns null on a null meta or a non-positive device count.
// NOTE(review): this extern "C" block defines the same four symbols as
// src/llaisys/models/qwen2.cc — linking both files together will cause
// duplicate-symbol errors; presumably only one is meant to be built.
struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) {
    if (!meta || ndevice < 1) return nullptr;
    // For now support single device
    int device_id = device_ids ? device_ids[0] : 0;

    // Copy meta
    LlaisysQwen2Meta cpp_meta = *meta;

    auto* model = new llaisys::Qwen2Model(cpp_meta, device, device_id);
    return reinterpret_cast<struct LlaisysQwen2Model*>(model);
}

// Free a model created by llaisysQwen2ModelCreate; null is a no-op.
// NOTE(review): duplicate symbol with src/llaisys/models/qwen2.cc.
void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) {
    if (model) {
        delete reinterpret_cast<llaisys::Qwen2Model*>(model);
    }
}

// Expose the model's weight-slot struct; returns null for a null model.
// NOTE(review): duplicate symbol with src/llaisys/models/qwen2.cc.
struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model) {
    if (!model) return nullptr;
    auto* cpp_model = reinterpret_cast<llaisys::Qwen2Model*>(model);
    return cpp_model->getWeights();
}

// Run inference over ntoken tokens, forwarding the raw pointer to the
// C++ model; returns the next token id, or -1 for a null model.
// NOTE(review): duplicate symbol with src/llaisys/models/qwen2.cc,
// and that variant takes a std::vector copy instead of the raw pointer
// — the two infer() signatures differ; confirm which is canonical.
int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) {
    if (!model) return -1;
    auto* cpp_model = reinterpret_cast<llaisys::Qwen2Model*>(model);
    return cpp_model->infer(token_ids, ntoken);
}

}
Loading