Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#models
/models/
# Xmake cache
.xmake/
build/

# md docs
*.md
# Binaries
bin/
lib/
Expand Down Expand Up @@ -87,4 +90,4 @@ htmlcov/
# Windows
Thumbs.db
ehthumbs.db
desktop.ini
desktop.ini
2 changes: 2 additions & 0 deletions =3.1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: jinja2 in /usr/lib/python3/dist-packages (3.0.3)
3 changes: 2 additions & 1 deletion include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ __C {

__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);

__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken,
float temperature, size_t topK, float topP, int64_t seed);
}
#endif // LLAISYS_MODELS_QWEN2_H
2 changes: 2 additions & 0 deletions include/llaisys/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
__C {
__export void llaisysAdd(llaisysTensor_t c, llaisysTensor_t a, llaisysTensor_t b);
__export void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals);
__export void llaisysRandSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t vals,
float temperature, size_t topK, float topP, int64_t seed);
__export void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight);
__export void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias);
__export void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in);
Expand Down
9 changes: 9 additions & 0 deletions python/llaisys/libllaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
from .tensor import llaisysTensor_t
from .tensor import load_tensor
from .ops import load_ops
# Llaisys infer
from .models import load_models
from .models import LlaisysQwen2Meta
from .models import LlaisysQwen2Weights


def load_shared_library():
Expand All @@ -38,6 +42,8 @@ def load_shared_library():
load_runtime(LIB_LLAISYS)
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
# Llaisys load_models
load_models(LIB_LLAISYS)


__all__ = [
Expand All @@ -52,4 +58,7 @@ def load_shared_library():
"llaisysMemcpyKind_t",
"MemcpyKind",
"llaisysStream_t",
# Llaisys c side
"LlaisysQwen2Meta",
"LlaisysQwen2Weights",
]
69 changes: 69 additions & 0 deletions python/llaisys/libllaisys/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import ctypes
from ctypes import c_size_t, c_int64, c_int, c_float
from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t
from .tensor import llaisysTensor_t

# c side wrap

class LlaisysQwen2Meta(ctypes.Structure):
    """ctypes mirror of the C struct ``LlaisysQwen2Meta`` (Qwen2 hyper-parameters).

    Field order and types must match the C declaration exactly; it is passed
    by pointer to ``llaisysQwen2ModelCreate``.
    """
    _fields_ = [
        ("dtype", llaisysDataType_t),  # weight/activation data type
        ("nlayer", c_size_t),          # number of transformer layers
        ("hs", c_size_t),              # hidden size
        ("nh", c_size_t),              # number of attention heads
        ("nkvh", c_size_t),            # number of key/value heads
        ("dh", c_size_t),              # per-head dimension (hs // nh)
        ("di", c_size_t),              # MLP intermediate size
        ("maxseq", c_size_t),          # max sequence length (max_position_embeddings)
        ("voc", c_size_t),             # vocabulary size
        ("epsilon", c_float),          # RMSNorm epsilon (rms_norm_eps)
        ("theta", c_float),            # RoPE base frequency (rope_theta)
        ("end_token", c_int64),        # EOS token id
    ]


class LlaisysQwen2Weights(ctypes.Structure):
    """ctypes mirror of the C struct holding Qwen2 weight tensor handles.

    Pointer-typed fields are C arrays indexed by layer (one entry per
    transformer layer); scalar fields are single model-wide tensors.
    """
    _fields_ = [
        ("in_embed", llaisysTensor_t),    # token embedding table (model.embed_tokens)
        ("out_embed", llaisysTensor_t),   # output projection / LM head (lm_head)
        ("out_norm_w", llaisysTensor_t),  # final norm weight (model.norm)
        ("attn_norm_w", ctypes.POINTER(llaisysTensor_t)),  # input_layernorm, per layer
        ("attn_q_w", ctypes.POINTER(llaisysTensor_t)),     # q_proj weight, per layer
        ("attn_q_b", ctypes.POINTER(llaisysTensor_t)),     # q_proj bias, per layer
        ("attn_k_w", ctypes.POINTER(llaisysTensor_t)),     # k_proj weight, per layer
        ("attn_k_b", ctypes.POINTER(llaisysTensor_t)),     # k_proj bias, per layer
        ("attn_v_w", ctypes.POINTER(llaisysTensor_t)),     # v_proj weight, per layer
        ("attn_v_b", ctypes.POINTER(llaisysTensor_t)),     # v_proj bias, per layer
        ("attn_o_w", ctypes.POINTER(llaisysTensor_t)),     # o_proj weight, per layer
        ("mlp_norm_w", ctypes.POINTER(llaisysTensor_t)),   # post_attention_layernorm, per layer
        ("mlp_gate_w", ctypes.POINTER(llaisysTensor_t)),   # gate_proj weight, per layer
        ("mlp_up_w", ctypes.POINTER(llaisysTensor_t)),     # up_proj weight, per layer
        ("mlp_down_w", ctypes.POINTER(llaisysTensor_t)),   # down_proj weight, per layer
    ]


def load_models(lib):
    """Attach ctypes argument/return signatures for the Qwen2 model API.

    Must be called once on the loaded shared library before any of the
    ``llaisysQwen2Model*`` entry points are invoked.
    """
    create = lib.llaisysQwen2ModelCreate
    create.argtypes = [
        ctypes.POINTER(LlaisysQwen2Meta),  # model hyper-parameters
        llaisysDeviceType_t,               # target device kind
        ctypes.POINTER(c_int),             # device id array
        c_int,                             # number of device ids
    ]
    create.restype = ctypes.c_void_p  # opaque model handle

    destroy = lib.llaisysQwen2ModelDestroy
    destroy.argtypes = [ctypes.c_void_p]
    destroy.restype = None

    weights = lib.llaisysQwen2ModelWeights
    weights.argtypes = [ctypes.c_void_p]
    weights.restype = ctypes.POINTER(LlaisysQwen2Weights)

    infer = lib.llaisysQwen2ModelInfer
    infer.argtypes = [
        ctypes.c_void_p,          # model handle
        ctypes.POINTER(c_int64),  # token id buffer
        c_size_t,                 # token count
        c_float,                  # temperature
        c_size_t,                 # top-k
        c_float,                  # top-p
        c_int64,                  # RNG seed
    ]
    infer.restype = c_int64  # sampled next-token id
13 changes: 12 additions & 1 deletion python/llaisys/libllaisys/ops.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .tensor import llaisysTensor_t
from ctypes import c_float
from ctypes import c_float, c_int64, c_size_t

def load_ops(lib):
lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t]
Expand All @@ -8,6 +8,17 @@ def load_ops(lib):
lib.llaisysArgmax.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t]
lib.llaisysArgmax.restype = None

lib.llaisysRandSample.argtypes = [
llaisysTensor_t,
llaisysTensor_t,
llaisysTensor_t,
c_float,
c_size_t,
c_float,
c_int64,
]
lib.llaisysRandSample.restype = None

lib.llaisysEmbedding.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t]
lib.llaisysEmbedding.restype = None

Expand Down
188 changes: 178 additions & 10 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,177 @@
from typing import Sequence
from ..libllaisys import LIB_LLAISYS
from ..libllaisys import DeviceType
from ..libllaisys import DataType
from ..libllaisys import llaisysDeviceType_t
from ..libllaisys.models import LlaisysQwen2Meta

from pathlib import Path
import os
import safetensors
import json
import ctypes
import torch


class Qwen2:

def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
    """Load a Qwen2 model from *model_path* onto *device*.

    Reads hyper-parameters from ``config.json``, creates the C-side model
    via ``llaisysQwen2ModelCreate``, then streams every ``*.safetensors``
    weight into the C tensors with ``tensorLoad``.

    Args:
        model_path: directory containing ``config.json`` and the
            ``*.safetensors`` weight file(s).
        device: target device kind (defaults to CPU).
    """
    self._device = device
    model_path = Path(model_path)
    config_path = model_path / "config.json"
    # Read model hyper-parameters from config.json.
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    # Dimension parameters.
    hs = int(config["hidden_size"])
    nlayer = int(config["num_hidden_layers"])
    nh = int(config["num_attention_heads"])
    # num_key_value_heads may be absent; fall back to num_attention_heads.
    nkvh = int(config.get("num_key_value_heads", nh))
    di = int(config["intermediate_size"])
    dh = hs // nh  # per-head dimension

    # Sequence/vocab/norm/RoPE parameters.
    maxseq = int(config["max_position_embeddings"])
    voc = int(config["vocab_size"])
    epsilon = float(config["rms_norm_eps"])
    theta = float(config["rope_theta"])
    end_token = int(config["eos_token_id"])

    dtype = self._select_dtype(device)
    self._dtype = dtype
    # Construct the C struct LlaisysQwen2Meta.
    meta = LlaisysQwen2Meta(
        dtype=dtype,
        nlayer=nlayer,
        hs=hs,
        nh=nh,
        nkvh=nkvh,
        dh=dh,
        di=di,
        maxseq=maxseq,
        voc=voc,
        epsilon=epsilon,
        theta=theta,
        end_token=end_token,
    )

    # Single device, id 0.
    device_ids = (ctypes.c_int * 1)(0)
    # Create the C-side model instance.
    self._model = LIB_LLAISYS.llaisysQwen2ModelCreate(
        ctypes.byref(meta), llaisysDeviceType_t(device), device_ids, 1
    )

    # Grab the C-side weight handles so we can fill them below.
    self._weights = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model).contents
    self._end_token = end_token

    # Hoisted out of the loop: target torch dtype is loop-invariant.
    torch_dtype = self._torch_dtype(dtype)
    # Traverse all safetensors files (Qwen2-0.5B-class models ship just one).
    for file in sorted(model_path.glob("*.safetensors")):
        # framework="pt" so bfloat16 tensors load natively (numpy cannot).
        data_ = safetensors.safe_open(file, framework="pt", device="cpu")
        for name_ in data_.keys():
            weight = self._match_weight(name_)
            if weight is None:
                continue  # unknown/unmapped tensor name
            arr = data_.get_tensor(name_)
            if arr.dtype != torch_dtype:
                arr = arr.to(torch_dtype)
            # tensorLoad copies from a raw host pointer, so the tensor
            # must be contiguous.
            arr = arr.contiguous()
            LIB_LLAISYS.tensorLoad(weight, ctypes.c_void_p(arr.data_ptr()))


def _match_weight(self, name: str):
    """Map a safetensors weight name to the matching C-side tensor handle.

    Args:
        name: fully qualified tensor name, e.g.
            ``model.layers.3.self_attn.q_proj.weight``.

    Returns:
        The tensor handle from ``self._weights``, or ``None`` when the
        name does not correspond to any known weight.
    """
    w = self._weights
    # Model-wide (non-layer) weights.
    if name == "model.embed_tokens.weight":
        return w.in_embed
    if name in ("lm_head.weight", "model.lm_head.weight"):
        return w.out_embed
    if name == "model.norm.weight":
        return w.out_norm_w
    # Only transformer-layer weights remain.
    if not name.startswith("model.layers."):
        return None
    parts = name.split(".")
    if len(parts) < 5:
        return None
    try:
        layer = int(parts[2])  # layer index
    except ValueError:
        return None  # malformed name, e.g. non-numeric layer field
    tail = ".".join(parts[3:])  # suffix after the layer index
    # Dispatch table: per-layer suffix -> per-layer tensor array.
    per_layer = {
        # Attention.
        "input_layernorm.weight": w.attn_norm_w,
        "self_attn.q_proj.weight": w.attn_q_w,
        "self_attn.q_proj.bias": w.attn_q_b,
        "self_attn.k_proj.weight": w.attn_k_w,
        "self_attn.k_proj.bias": w.attn_k_b,
        "self_attn.v_proj.weight": w.attn_v_w,
        "self_attn.v_proj.bias": w.attn_v_b,
        "self_attn.o_proj.weight": w.attn_o_w,
        # FFN.
        "post_attention_layernorm.weight": w.mlp_norm_w,
        "mlp.gate_proj.weight": w.mlp_gate_w,
        "mlp.up_proj.weight": w.mlp_up_w,
        "mlp.down_proj.weight": w.mlp_down_w,
    }
    table = per_layer.get(tail)
    return table[layer] if table is not None else None

def _select_dtype(self, device: DeviceType) -> DataType:
    """Pick the compute dtype: LLAISYS_DTYPE env override, else per-device default."""
    overrides = {
        "f16": DataType.F16,
        "float16": DataType.F16,
        "f32": DataType.F32,
        "float32": DataType.F32,
        "bf16": DataType.BF16,
        "bfloat16": DataType.BF16,
    }
    requested = os.environ.get("LLAISYS_DTYPE", "").strip().lower()
    if requested in overrides:
        return overrides[requested]
    # No override: F32 on NVIDIA devices, BF16 everywhere else.
    return DataType.F32 if device == DeviceType.NVIDIA else DataType.BF16

def _torch_dtype(self, dtype: DataType):
    """Translate a llaisys DataType to the matching torch dtype (float32 fallback)."""
    mapping = {
        DataType.F16: torch.float16,
        DataType.F32: torch.float32,
        DataType.BF16: torch.bfloat16,
    }
    return mapping.get(dtype, torch.float32)

def _infer(self, tokens: Sequence[int], temperature: float, top_k: int, top_p: float, seed: int) -> int:
    """Run one forward step over *tokens* and return the sampled next-token id.

    Empty input short-circuits to the EOS token rather than calling into C.
    """
    if not tokens:
        return self._end_token
    count = len(tokens)
    # Marshal the Python ints into a C int64 array for the FFI call.
    token_buf = (ctypes.c_int64 * count)(*tokens)
    sampled = LIB_LLAISYS.llaisysQwen2ModelInfer(
        self._model,
        token_buf,
        ctypes.c_size_t(count),
        ctypes.c_float(temperature),
        ctypes.c_size_t(top_k),
        ctypes.c_float(top_p),
        ctypes.c_int64(seed),
    )
    return int(sampled)

def generate(
self,
Expand All @@ -26,8 +180,22 @@ def generate(
top_k: int = 1,
top_p: float = 0.8,
temperature: float = 0.8,
seed: int = 0,
):

# TODO: Implement generate function

return []
# max new tokens default value:32
if max_new_tokens is None:
max_new_tokens = 32
tokens = list(inputs)
if max_new_tokens == 0:
return tokens
# prefill
next_token = self._infer(tokens, temperature, top_k, top_p, seed)
tokens.append(next_token)
# decode
for _ in range(max_new_tokens - 1):
if tokens[-1] == self._end_token:
break
seed += 1
next_token = self._infer([tokens[-1]], temperature, top_k, top_p, seed)
tokens.append(next_token)
return tokens
Loading
Loading