Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1126c82
hw1 load()
ArcaLunar Jan 29, 2026
302a738
hw1 iscontiguous()
ArcaLunar Jan 29, 2026
52b9d6a
hw1 view()
ArcaLunar Jan 29, 2026
ff386bf
hw1 permute()
ArcaLunar Jan 29, 2026
100fb93
hw1 slice()
ArcaLunar Jan 29, 2026
1a439b8
2.1 argmax
ArcaLunar Jan 29, 2026
8aeb26b
2.2 embedding; add openmp support
ArcaLunar Jan 29, 2026
4bb2a71
2.3 linear
ArcaLunar Jan 29, 2026
1b8576c
2.4 RMS Norm
ArcaLunar Jan 29, 2026
0663d91
2.5 RoPE
ArcaLunar Jan 29, 2026
f07019e
2.6 Self-Attn + openmp
ArcaLunar Jan 29, 2026
8435a48
2.7 SwiGLU
ArcaLunar Jan 29, 2026
139c28a
impl simple kv cache
ArcaLunar Jan 30, 2026
ae5060e
impl qwen2 model basic config functions; add config reading in Python
ArcaLunar Jan 30, 2026
273c4ef
impl weight loading
ArcaLunar Jan 30, 2026
8926d9e
impl full pipeline; fix issue: Python llaisysTensor cannot get import…
ArcaLunar Jan 30, 2026
f180950
correct until self-attn + kvcache
ArcaLunar Jan 30, 2026
82b0967
add naive attn
ArcaLunar Jan 30, 2026
100d722
pipeline ok, wrong output
ArcaLunar Jan 30, 2026
69d4758
prefill ok, decoding issue
ArcaLunar Jan 30, 2026
71403f1
fp32 first token ok, rest boom
ArcaLunar Jan 30, 2026
5f6b405
all prefilling ok
ArcaLunar Jan 30, 2026
71bc41a
use all prefilling first
ArcaLunar Jan 30, 2026
c81db06
fix size_t to int conversion on windows
ArcaLunar Jan 30, 2026
91b7581
fix decoding issue: improper offset in simple.cc (begin .... elementS…
ArcaLunar Jan 30, 2026
e199798
feat: nvidia runtime api
ArcaLunar Mar 6, 2026
bbc1979
feat+build: add cuda ops and update build
ArcaLunar Mar 7, 2026
025ceac
feat: implement cuda kernels for all ops
ArcaLunar Mar 7, 2026
571ca4c
format
ArcaLunar Mar 7, 2026
2253fd8
feat: impl kvcache with unified context
ArcaLunar Mar 7, 2026
90e3180
fix: qwen2.cc directly unrefs device-side ptr
ArcaLunar Mar 7, 2026
db5d5d2
feat: sampling operator, topk, topp, temperature
ArcaLunar Mar 10, 2026
ab611e6
feat: integrate sampling into model definition
ArcaLunar Mar 10, 2026
77ff1c5
feat: chat API and server
ArcaLunar Mar 10, 2026
d070eec
fix: chat server port issue
ArcaLunar Mar 10, 2026
896da9e
fix: logits on nvidia instead of cpu before sampling
ArcaLunar Mar 10, 2026
b553cb3
feat: add streaming output
ArcaLunar Mar 12, 2026
e624d05
feat: chatting tui
ArcaLunar Mar 15, 2026
8dd1458
update REPORT.md
ArcaLunar Mar 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 4 additions & 22 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,9 @@ IndentWidth: 4 # 缩进宽度,LLVM 默认值为 2,改
AccessModifierOffset: -4 # public/protected/private 访问控制符相对成员的偏移,与 IndentWidth 配合,LLVM 默认值为 -2
AlignOperands: AlignAfterOperator # 双目运算符的行间对齐,LLVM 默认值为 Align,改为带符号一起换行
BreakBeforeBinaryOperators: All # 在双目运算符之前换行,LLVM 默认值为 None,改为换行时总是把双目运算符放在行首,包括赋值(=)
ColumnLimit: 0 # 列宽限制,LLVM 默认值为 80,改为不限制
ColumnLimit: 80 # 列宽限制,LLVM 默认值为 80,保持默认值
AllowShortBlocksOnASingleLine: Always # 是否允许短块(单个语句的块)不换行,LLVM 默认值为 Never,改为允许
AllowShortLoopsOnASingleLine: true # 是否允许短循环不换行,LLVM 默认值为 false,改为允许
InsertBraces: true # 是否在 if/for/while/switch 等语句后插入大括号,LLVM 默认值为 false,改为允许
BreakBeforeBraces: Custom # 大括号换行配置,LLVM 默认值为 LLVM,改为自定义以使 BraceWrapping 生效
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
InsertBraces: false # 是否在 if/for/while/switch 等语句后插入大括号,LLVM 默认值为 false,保持默认不插入

BinPackParameters: OnePerLine
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,6 @@ htmlcov/
# Windows
Thumbs.db
ehthumbs.db
desktop.ini
desktop.ini

data
25 changes: 25 additions & 0 deletions REPORT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
## CUDA Backend for Chat Server

首先设置环境变量 `LLAISYS_DEVICE=nvidia` 来启用 CUDA backend.

## 流式输出

然后运行 `python/chat_server.py` 启用 OpenAI 风格的 API

然后可以用 `curl` 来测试流式输出

```bash
curl -N http://127.0.0.1:9108/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Accept: text/event-stream" \
-d '{"model":"qwen2","messages":[{"role":"user","content":"Hi who are you?"}],"stream":true,"max_tokens":64,"temperature":0.8,"top_p":0.9,"top_k":40}'
```

### TUI Chatting

```bash
python -m llaisys.chat.tui --url http://127.0.0.1:9108

# 或者用 uv
uv run python -m llaisys.chat.tui --url http://127.0.0.1:9108
```
39 changes: 36 additions & 3 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define LLAISYS_MODELS_QWEN2_H

#include "../tensor.h"
#include <cstdint>
#include <string>

__C {
struct LlaisysQwen2Meta {
Expand Down Expand Up @@ -31,12 +33,43 @@ __C {

struct LlaisysQwen2Model;

__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);
__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(
const LlaisysQwen2Meta *meta, llaisysDeviceType_t device,
int *device_ids, int ndevice);

__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);

__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(
struct LlaisysQwen2Model * model);

__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
/**
* @brief Inference function for Qwen2 Model. This function combines both
* prefill and decode through `prefill` flag.
* @note This function will reset KV Caches if `prefill` is true.
*
* @param token_ids input token ids
* @param pos_ids input position ids, used for RoPE
*/
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model,
int64_t *token_ids, int64_t *pos_ids, size_t ntoken, bool prefill);

/**
* @brief Inference with configurable decoding strategy.
*
* This behaves the same as llaisysQwen2ModelInfer for prefill/decode flow,
* but selects the next token using temperature/top-k/top-p sampling.
*
* @param top_k Keep only top-k tokens before sampling (0 = disabled).
* @param top_p Nucleus threshold in (0, 1] (1.0 = disabled).
* @param temperature Positive temperature scalar.
*/
__export int64_t llaisysQwen2ModelInferSample(struct LlaisysQwen2Model * model,
int64_t *token_ids, int64_t *pos_ids,
size_t ntoken, bool prefill,
int top_k, float top_p, float temperature);

__export void llaisysQwen2SetWeights(struct LlaisysQwen2Model * model,
int name, int layer_id,
llaisysTensor_t tensor);
}
#endif // LLAISYS_MODELS_QWEN2_H
14 changes: 14 additions & 0 deletions include/llaisys/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ __C {
__export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta);
__export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale);
__export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up);

/**
* @brief Sample a token index from logits with temperature / top-k / top-p.
* @param out Shape {1}, dtype int64. Receives the sampled token index.
* @param logits Shape {vocab_size}, any float dtype. Must be contiguous.
* @param top_k Keep only top-k tokens (0 = disabled).
* @param top_p Nucleus threshold in (0, 1] (1.0 = disabled).
* @param temperature Positive temperature scalar.
*/
__export void llaisysSample(llaisysTensor_t out, llaisysTensor_t logits,
int top_k, float top_p, float temperature);

/** Set the per-thread RNG seed used by llaisysSample. */
__export void llaisysSampleSetSeed(uint64_t seed);
}

#endif
13 changes: 13 additions & 0 deletions python/chat_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import uvicorn

from llaisys.chat.server import build_runtime_from_env, create_app


def main() -> None:
    """Start the LLAISYS chat server under uvicorn.

    Host and port default to the documented 0.0.0.0:9108, but can be
    overridden with the LLAISYS_CHAT_HOST / LLAISYS_CHAT_PORT environment
    variables (the rest of the project is already configured via env vars,
    e.g. LLAISYS_MODEL_PATH and LLAISYS_DEVICE).
    """
    import os  # local import keeps the module's import surface unchanged

    runtime = build_runtime_from_env()
    app = create_app(runtime)
    host = os.environ.get("LLAISYS_CHAT_HOST", "0.0.0.0")
    port = int(os.environ.get("LLAISYS_CHAT_PORT", "9108"))
    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions python/llaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .ops import Ops
from . import models
from .models import *
from .libllaisys import LIB_LLAISYS

__all__ = [
"RuntimeAPI",
Expand All @@ -17,4 +18,5 @@
"Tensor",
"Ops",
"models",
"LIB_LLAISYS"
]
3 changes: 3 additions & 0 deletions python/llaisys/chat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .server import create_app

__all__ = ["create_app"]
209 changes: 209 additions & 0 deletions python/llaisys/chat/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import os
import time
import json
import threading
from dataclasses import dataclass, field
from typing import Literal

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from transformers import AutoTokenizer

from ..libllaisys import DeviceType
from ..models.qwen2 import Qwen2


class ChatMessage(BaseModel):
    """One turn of an OpenAI-style conversation."""

    role: Literal["system", "user", "assistant"]  # author of this turn
    content: str  # plain-text message body


class ChatCompletionRequest(BaseModel):
    """Body of POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str = Field(default="qwen2")  # model name; echoed back in the response
    messages: list[ChatMessage]  # conversation so far; must be non-empty
    max_tokens: int = Field(default=128, ge=1, le=4096)  # generation budget
    temperature: float = Field(default=0.8, gt=0.0)  # sampling temperature (> 0)
    top_p: float = Field(default=0.9, gt=0.0, le=1.0)  # nucleus threshold (1.0 = disabled)
    top_k: int = Field(default=40, ge=0)  # top-k cutoff (0 = disabled)
    stream: bool = False  # when true, reply with SSE chunks instead of one JSON body


class ChoiceMessage(BaseModel):
    """Assistant message carried inside a completion choice."""

    role: Literal["assistant"] = "assistant"  # responses are always assistant turns
    content: str  # generated reply text


class ChatCompletionChoice(BaseModel):
    """A single completion candidate (this server only ever returns one)."""

    index: int  # position within the choices list
    message: ChoiceMessage
    finish_reason: Literal["stop", "length"]  # why generation ended


class ChatCompletionUsage(BaseModel):
    """Token accounting, mirroring the OpenAI usage object."""

    prompt_tokens: int  # tokens in the rendered prompt
    completion_tokens: int  # newly generated tokens
    total_tokens: int  # prompt_tokens + completion_tokens


class ChatCompletionResponse(BaseModel):
    """Non-streaming response of POST /v1/chat/completions."""

    id: str  # request id, e.g. "chatcmpl-<millis>"
    object: Literal["chat.completion"] = "chat.completion"
    created: int  # unix timestamp (seconds)
    model: str  # echoed from the request
    choices: list[ChatCompletionChoice]
    usage: ChatCompletionUsage


@dataclass
class ChatRuntime:
    """Shared serving state: tokenizer, model, and a generation lock."""

    tokenizer: AutoTokenizer  # HF tokenizer: encode/decode + chat template
    model: Qwen2  # loaded LLAISYS Qwen2 model
    # Handlers hold this lock for the entire generation, serializing requests
    # (the model is used as a single-request-at-a-time resource here).
    lock: threading.Lock = field(default_factory=threading.Lock)


def _render_prompt(messages: list[ChatMessage], tokenizer: AutoTokenizer) -> str:
    """Render chat messages into a single prompt string.

    Prefers the tokenizer's built-in chat template and falls back to a
    simple "<role>\\ncontent\\n" transcript ending with an open assistant
    turn.

    Fix: the previous ``hasattr(tokenizer, "apply_chat_template")`` check
    was insufficient — HF tokenizers generally *have* the method, but it
    raises when no chat template is configured, so the fallback was
    unreachable exactly when it was needed. Use EAFP instead.
    """
    role_map = [{"role": m.role, "content": m.content} for m in messages]
    try:
        return tokenizer.apply_chat_template(
            role_map,
            tokenize=False,
            add_generation_prompt=True,
        )
    except (AttributeError, ValueError):
        # No usable chat template: fall back to the plain transcript format.
        pass

    chunks = [f"<{message['role']}>\n{message['content']}\n" for message in role_map]
    chunks.append("<assistant>\n")
    return "".join(chunks)


def _decode_new_text(tokenizer: AutoTokenizer, all_tokens: list[int], prompt_len: int) -> str:
    """Decode only the completion portion of *all_tokens* (past the prompt)."""
    completion = all_tokens[prompt_len:]
    return tokenizer.decode(completion, skip_special_tokens=True)


def _sse(event: dict) -> str:
return f"data: {json.dumps(event, ensure_ascii=False)}\n\n"


def create_app(runtime: ChatRuntime) -> FastAPI:
    """Build the FastAPI app exposing a minimal OpenAI-compatible chat API.

    Routes:
        GET  /health               -- liveness probe.
        POST /v1/chat/completions  -- blocking JSON or SSE-streaming completion.
    """
    app = FastAPI(title="LLAISYS Chat Server", version="0.1.0")

    @app.get("/health")
    def health() -> dict[str, str]:
        return {"status": "ok"}

    @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
    def chat_completions(req: ChatCompletionRequest):
        if not req.messages:
            raise HTTPException(status_code=400, detail="messages must not be empty")

        # Render the conversation once and tokenize it up front.
        prompt = _render_prompt(req.messages, runtime.tokenizer)
        prompt_tokens = runtime.tokenizer.encode(prompt)

        created = int(time.time())
        # Millisecond timestamp as a cheap request id (not collision-proof).
        response_id = f"chatcmpl-{int(time.time() * 1000)}"

        if req.stream:
            def event_stream():
                # Hold the lock for the whole generation: one request at a time.
                with runtime.lock:
                    # First chunk carries only the assistant role, per the
                    # OpenAI streaming convention.
                    yield _sse({
                        "id": response_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": req.model,
                        "choices": [{
                            "index": 0,
                            "delta": {"role": "assistant"},
                            "finish_reason": None,
                        }],
                    })

                    generated = []
                    last_text = ""
                    for token in runtime.model.generate_stream(
                        prompt_tokens,
                        max_new_tokens=req.max_tokens,
                        top_k=req.top_k,
                        top_p=req.top_p,
                        temperature=req.temperature,
                    ):
                        generated.append(token)
                        # Re-decode the full generated sequence each step and
                        # emit only the new suffix; this avoids emitting broken
                        # partial multi-byte characters from single tokens.
                        current_text = runtime.tokenizer.decode(generated, skip_special_tokens=True)
                        delta_text = current_text[len(last_text):]
                        if not delta_text:
                            continue
                        last_text = current_text
                        yield _sse({
                            "id": response_id,
                            "object": "chat.completion.chunk",
                            "created": created,
                            "model": req.model,
                            "choices": [{
                                "index": 0,
                                "delta": {"content": delta_text},
                                "finish_reason": None,
                            }],
                        })

                    # Terminal chunk with finish_reason, then the SSE sentinel.
                    # NOTE(review): finish_reason is always "stop", even when the
                    # max_tokens budget was exhausted — confirm against generate_stream.
                    yield _sse({
                        "id": response_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": req.model,
                        "choices": [{
                            "index": 0,
                            "delta": {},
                            "finish_reason": "stop",
                        }],
                    })
                    yield "data: [DONE]\n\n"

            return StreamingResponse(event_stream(), media_type="text/event-stream")

        with runtime.lock:
            # NOTE(review): the slicing below assumes generate() returns
            # prompt + completion tokens — confirm against Qwen2.generate.
            generated_tokens = runtime.model.generate(
                prompt_tokens,
                max_new_tokens=req.max_tokens,
                top_k=req.top_k,
                top_p=req.top_p,
                temperature=req.temperature,
            )

        answer_text = _decode_new_text(runtime.tokenizer, generated_tokens, len(prompt_tokens))
        completion_tokens = max(0, len(generated_tokens) - len(prompt_tokens))

        return ChatCompletionResponse(
            id=response_id,
            created=created,
            model=req.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=ChoiceMessage(content=answer_text),
                    finish_reason="stop",
                )
            ],
            usage=ChatCompletionUsage(
                prompt_tokens=len(prompt_tokens),
                completion_tokens=completion_tokens,
                total_tokens=len(prompt_tokens) + completion_tokens,
            ),
        )

    return app


def _parse_device(device_name: str) -> DeviceType:
    """Map a device name to a DeviceType (case-insensitive); default is CPU."""
    normalized = device_name.lower()
    return DeviceType.NVIDIA if normalized == "nvidia" else DeviceType.CPU


def build_runtime_from_env() -> ChatRuntime:
    """Construct the serving runtime from environment variables.

    LLAISYS_MODEL_PATH (default "./data") points at the model directory;
    LLAISYS_DEVICE (default "cpu") selects the backend device.
    """
    path = os.environ.get("LLAISYS_MODEL_PATH", "./data")
    backend = _parse_device(os.environ.get("LLAISYS_DEVICE", "cpu"))

    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    qwen = Qwen2(model_path=path, device=backend)
    return ChatRuntime(tokenizer=tok, model=qwen)
Loading